linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
@ 2013-06-19 17:57 Naveen N. Rao
  2013-06-19 17:57 ` [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
  2013-06-20  7:39 ` [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Borislav Petkov
  0 siblings, 2 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-19 17:57 UTC (permalink / raw)
  To: tony.luck, bp; +Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

The Corrected Machine Check structure (CMC) in HEST has a flag which can be
set by the firmware to indicate to the OS that it prefers to process the
corrected error events first. In this scenario, the OS is expected to not
monitor for corrected errors (through CMCI/polling). Instead, the firmware
notifies the OS on corrected error events through GHES.

Linux already has support for GHES. This patch adds support for parsing CMC
structure and to disable CMCI/polling if the firmware first flag is set.

Further, the list of machine check bank structures at the end of CMC is used
to determine which MCA banks function in FF mode, so that we continue to
monitor error events on the other banks.


- Naveen

--
Changes:
- Incorporated comments from Boris and Tony from the previous thread at
  http://thread.gmane.org/gmane.linux.acpi.devel/60802
- Added patch to disable firmware first mode through a boot option.


Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/x86/include/asm/mce.h                |    3 ++
 arch/x86/kernel/cpu/mcheck/mce-internal.h |    3 ++
 arch/x86/kernel/cpu/mcheck/mce.c          |   25 ++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce_intel.c    |   40 +++++++++++++++++++++++------
 drivers/acpi/apei/hest.c                  |   36 ++++++++++++++++++++++++++
 5 files changed, 99 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fa5f71e..380fff8 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -188,6 +188,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
 				    const char __user *ubuf,
 				    size_t usize, loff_t *off));
 
+/* Disable CMCI/polling for MCA bank claimed by firmware */
+extern void mce_disable_ce_bank(int bank);
+
 /*
  * Exception handler
  */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 5b7d4fa..193edc1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -25,6 +25,9 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
+extern mce_banks_t mce_banks_ce_disabled;
+
+void cmci_disable_bank(int bank);
 
 #ifdef CONFIG_X86_MCE_INTEL
 unsigned long mce_intel_adjust_timer(unsigned long interval);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9239504..9512034 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -94,6 +94,9 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
 
+/* MCA banks controlled through firmware first for corrected errors */
+mce_banks_t mce_banks_ce_disabled;
+
 static DEFINE_PER_CPU(struct work_struct, mce_work);
 
 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
@@ -1932,6 +1935,28 @@ static struct miscdevice mce_chrdev_device = {
 	&mce_chrdev_ops,
 };
 
+static void __mce_disable_ce_bank(void *arg)
+{
+	int bank = *((int *)arg);
+
+	/* Ensure we don't poll this bank */
+	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
+	/* Disable CMCI */
+	cmci_disable_bank(bank);
+}
+
+void mce_disable_ce_bank(int bank)
+{
+	if (bank >= mca_cfg.banks) {
+		pr_warning(FW_BUG 
+			"Ignoring request to disable invalid MCA bank %d.\n",
+			bank);
+		return;
+	}
+	set_bit(bank, mce_banks_ce_disabled);
+	on_each_cpu(__mce_disable_ce_bank, &bank, 1);
+}
+
 /*
  * mce=off Disables machine check
  * mce=no_cmci Disables CMCI
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index ae1697c..78256c0 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -191,6 +191,10 @@ static void cmci_discover(int banks)
 		if (test_bit(i, owned))
 			continue;
 
+		/* Skip banks in firmware first mode */
+		if (test_bit(i, mce_banks_ce_disabled))
+			continue;
+
 		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
 
 		/* Already owned by someone else? */
@@ -259,6 +263,20 @@ void cmci_recheck(void)
 	local_irq_restore(flags);
 }
 
+/* Caller must hold the lock on cmci_discover_lock */
+static void __cmci_disable_bank(int bank)
+{
+	u64 val;
+
+	if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
+		return;
+	/* Disable CMCI */
+	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	val &= ~MCI_CTL2_CMCI_EN;
+	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	__clear_bit(bank, __get_cpu_var(mce_banks_owned));
+}
+
 /*
  * Disable CMCI on this CPU for all banks it owns when it goes down.
  * This allows other CPUs to claim the banks on rediscovery.
@@ -268,19 +286,12 @@ void cmci_clear(void)
 	unsigned long flags;
 	int i;
 	int banks;
-	u64 val;
 
 	if (!cmci_supported(&banks))
 		return;
 	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
 	for (i = 0; i < banks; i++) {
-		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
-			continue;
-		/* Disable CMCI */
-		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
-		val &= ~MCI_CTL2_CMCI_EN;
-		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
-		__clear_bit(i, __get_cpu_var(mce_banks_owned));
+		__cmci_disable_bank(i);
 	}
 	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
 }
@@ -315,6 +326,19 @@ void cmci_reenable(void)
 		cmci_discover(banks);
 }
 
+void cmci_disable_bank(int bank)
+{
+	int banks;
+	unsigned long flags;
+
+	if (!cmci_supported(&banks))
+		return;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	__cmci_disable_bank(bank);
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 static void intel_init_cmci(void)
 {
 	int banks;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index f5ef5d5..d8c69ba 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -36,6 +36,7 @@
 #include <linux/io.h>
 #include <linux/platform_device.h>
 #include <acpi/apei.h>
+#include <asm/mce.h>
 
 #include "apei-internal.h"
 
@@ -121,6 +122,39 @@ int apei_hest_parse(apei_hest_func_t func, void *data)
 }
 EXPORT_SYMBOL_GPL(apei_hest_parse);
 
+/*
+ * Check if firmware advertises firmware first mode. We need FF bit to be set
+ * along with a set of MC banks which work in FF mode.
+ */
+static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
+{
+	int i;
+	struct acpi_hest_ia_corrected *cmc;
+	struct acpi_hest_ia_error_bank *mc_bank;
+
+	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
+		return 0;
+
+	cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
+	if (!cmc->enabled || !(cmc->flags & ACPI_HEST_FIRMWARE_FIRST))
+		return 0;
+
+	/*
+	 * We expect HEST to provide a list of MC banks that
+	 * report errors through firmware first mode.
+	 */
+	if (cmc->num_hardware_banks == 0)
+		return 0;
+
+	pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
+
+	mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
+	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
+		mce_disable_ce_bank(mc_bank->bank_number);
+
+	return 0;
+}
+
 struct ghes_arr {
 	struct platform_device **ghes_devs;
 	unsigned int count;
@@ -227,6 +261,8 @@ void __init acpi_hest_init(void)
 		goto err;
 	}
 
+	apei_hest_parse(hest_parse_cmc, NULL);
+
 	if (!ghes_disable) {
 		rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
 		if (rc)


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 17:57 [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Naveen N. Rao
@ 2013-06-19 17:57 ` Naveen N. Rao
  2013-06-19 18:04   ` Borislav Petkov
  2013-06-20  7:48   ` Borislav Petkov
  2013-06-20  7:39 ` [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Borislav Petkov
  1 sibling, 2 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-19 17:57 UTC (permalink / raw)
  To: tony.luck, bp; +Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

Add a boot option to disable firmware first mode for corrected errors.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 Documentation/x86/x86_64/boot-options.txt |    5 +++++
 arch/x86/include/asm/acpi.h               |    2 ++
 arch/x86/kernel/acpi/boot.c               |    5 +++++
 drivers/acpi/apei/hest.c                  |    3 ++-
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index e9e8ddb..1228b22 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -176,6 +176,11 @@ ACPI
 
   acpi=noirq	Don't route interrupts
 
+  acpi=nocmcff	Disable firmware first mode for corrected errors. This
+		disables parsing the HEST CMC error source to check if
+		firmware has set the FF flag. This may result in
+		duplicate corrected error reports.
+
 PCI
 
   pci=off		Don't use PCI
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index b31bf97..42db2b8 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -86,6 +86,7 @@ extern int acpi_pci_disabled;
 extern int acpi_skip_timer_override;
 extern int acpi_use_timer_override;
 extern int acpi_fix_pin2_polarity;
+extern int acpi_disable_cmcff;
 
 extern u8 acpi_sci_flags;
 extern int acpi_sci_override_gsi;
@@ -168,6 +169,7 @@ static inline void arch_acpi_set_pdc_bits(u32 *buf)
 
 #define acpi_lapic 0
 #define acpi_ioapic 0
+#define acpi_disable_cmcff 0
 static inline void acpi_noirq_set(void) { }
 static inline void acpi_disable_pci(void) { }
 static inline void disable_acpi(void) { }
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 230c8ea..d1998d5 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -66,6 +66,7 @@ EXPORT_SYMBOL(acpi_pci_disabled);
 int acpi_lapic;
 int acpi_ioapic;
 int acpi_strict;
+int acpi_disable_cmcff;
 
 u8 acpi_sci_flags __initdata;
 int acpi_sci_override_gsi __initdata;
@@ -1619,6 +1620,10 @@ static int __init parse_acpi(char *arg)
 	/* "acpi=copy_dsdt" copys DSDT */
 	else if (strcmp(arg, "copy_dsdt") == 0) {
 		acpi_gbl_copy_dsdt_locally = 1;
+	}
+	/* "acpi=nocmcff" disables FF mode for corrected errors */
+	else if (strcmp(arg, "nocmcff") == 0) {
+		acpi_disable_cmcff = 1;
 	} else {
 		/* Core will printk when we return error. */
 		return -EINVAL;
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index d8c69ba..f8b1115 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -261,7 +261,8 @@ void __init acpi_hest_init(void)
 		goto err;
 	}
 
-	apei_hest_parse(hest_parse_cmc, NULL);
+	if (!acpi_disable_cmcff)
+		apei_hest_parse(hest_parse_cmc, NULL);
 
 	if (!ghes_disable) {
 		rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 17:57 ` [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
@ 2013-06-19 18:04   ` Borislav Petkov
  2013-06-19 18:17     ` Naveen N. Rao
  2013-06-19 18:19     ` Luck, Tony
  2013-06-20  7:48   ` Borislav Petkov
  1 sibling, 2 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-19 18:04 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Wed, Jun 19, 2013 at 11:27:42PM +0530, Naveen N. Rao wrote:
> Add a boot option to disable firmware first mode for corrected errors.
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  Documentation/x86/x86_64/boot-options.txt |    5 +++++
>  arch/x86/include/asm/acpi.h               |    2 ++
>  arch/x86/kernel/acpi/boot.c               |    5 +++++
>  drivers/acpi/apei/hest.c                  |    3 ++-
>  4 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
> index e9e8ddb..1228b22 100644
> --- a/Documentation/x86/x86_64/boot-options.txt
> +++ b/Documentation/x86/x86_64/boot-options.txt
> @@ -176,6 +176,11 @@ ACPI
>  
>    acpi=noirq	Don't route interrupts
>  
> +  acpi=nocmcff	Disable firmware first mode for corrected errors. This
> +		disables parsing the HEST CMC error source to check if
> +		firmware has set the FF flag. This may result in
> +		duplicate corrected error reports.

Interesting, why? Why would we even need such an option? My impression
is, if ACPI tells us FF, MCE code doesn't poll those banks anymore. So
where do the duplicated reports come from?

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 18:04   ` Borislav Petkov
@ 2013-06-19 18:17     ` Naveen N. Rao
  2013-06-19 18:19     ` Luck, Tony
  1 sibling, 0 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-19 18:17 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/19/2013 11:34 PM, Borislav Petkov wrote:
> On Wed, Jun 19, 2013 at 11:27:42PM +0530, Naveen N. Rao wrote:
>> Add a boot option to disable firmware first mode for corrected errors.
>>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>>   Documentation/x86/x86_64/boot-options.txt |    5 +++++
>>   arch/x86/include/asm/acpi.h               |    2 ++
>>   arch/x86/kernel/acpi/boot.c               |    5 +++++
>>   drivers/acpi/apei/hest.c                  |    3 ++-
>>   4 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
>> index e9e8ddb..1228b22 100644
>> --- a/Documentation/x86/x86_64/boot-options.txt
>> +++ b/Documentation/x86/x86_64/boot-options.txt
>> @@ -176,6 +176,11 @@ ACPI
>>
>>     acpi=noirq	Don't route interrupts
>>
>> +  acpi=nocmcff	Disable firmware first mode for corrected errors. This
>> +		disables parsing the HEST CMC error source to check if
>> +		firmware has set the FF flag. This may result in
>> +		duplicate corrected error reports.
>
> Interesting, why? Why would we even need such an option? My impression
> is, if ACPI tells us FF, MCE code doesn't poll those banks anymore. So
> where do the duplicated reports come from?

This option is a way to revert to the existing behavior where we 
continue to enable CMCI/poll. If we enable this option, then we will 
receive error events from CMCI/polling as well as from the firmware 
through GHES.


Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 18:04   ` Borislav Petkov
  2013-06-19 18:17     ` Naveen N. Rao
@ 2013-06-19 18:19     ` Luck, Tony
  2013-06-19 18:36       ` Borislav Petkov
  1 sibling, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-19 18:19 UTC (permalink / raw)
  To: Borislav Petkov, Naveen N. Rao
  Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, Huang, Ying

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1215 bytes --]

> Interesting, why? Why would we even need such an option? My impression
> is, if ACPI tells us FF, MCE code doesn't poll those banks anymore. So
> where do the duplicated reports come from?

The option is only disabling the Linux side of firmware first ... the BIOS
will still be doing it and generating records to feed to the OS using APEI.

So Linux may see the error in a bank and report it, and BIOS may report
the same error.  Though I'd expect that to be rare as whoever saw it first
would most likely clear the bank before the other could see it.

I asked for the option because I'm nervous about just skipping some banks
on the say-so of the BIOS ... what if the BIOS did something wrong. This
option gives us a way to return to the way things were before this patch.

These parts are now looking good ... but we still need to tackle what Linux
does when it does get the CPER record.  I suspect we need to preserve
the existing "fake an mcelog entry with just the address" on old platforms,
but need to do something smarter on new ones.

-Tony


ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 18:19     ` Luck, Tony
@ 2013-06-19 18:36       ` Borislav Petkov
  2013-06-19 19:05         ` Luck, Tony
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-19 18:36 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying

On Wed, Jun 19, 2013 at 06:19:25PM +0000, Luck, Tony wrote:
> > Interesting, why? Why would we even need such an option? My impression
> > is, if ACPI tells us FF, MCE code doesn't poll those banks anymore. So
> > where do the duplicated reports come from?
> 
> The option is only disabling the Linux side of firmware first ... the BIOS
> will still be doing it and generating records to feed to the OS using APEI.
> 
> So Linux may see the error in a bank and report it, and BIOS may report
> the same error.  Though I'd expect that to be rare as whoever saw it first
> would most likely clear the bank before the other could see it.
> 
> I asked for the option because I'm nervous about just skipping some banks
> on the say-so of the BIOS ... what if the BIOS did something wrong. This
> option gives us a way to return to the way things were before this patch.

Yeah, the code I saw only disables the banks in the HEST:

	mce_disable_ce_bank(mc_bank->bank_number)

and leaving the rest in poll mode. But I agree, we need this as a
fallback if BIOS is doing other crack smoking exercises and thus we want
to ignore FF completely.

> These parts are now looking good ... but we still need to tackle what
> Linux does when it does get the CPER record. I suspect we need to
> preserve the existing "fake an mcelog entry with just the address" on
> old platforms, but need to do something smarter on new ones.

Why, fill out struct mce and do mce_log(mce) does not suffice?

I'll take a look at the rest of the stuff tomorrow, on a clear head.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 18:36       ` Borislav Petkov
@ 2013-06-19 19:05         ` Luck, Tony
  2013-06-19 20:14           ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-19 19:05 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 926 bytes --]

> Why, fill out struct mce and do mce_log(mce) does not suffice?

There is (or should be) a lot more interesting stuff in the CPER than just the address. Stuff
that we don't have fields for in the existing mcelog structure.  We also need to treat filtered
records from modern APEI implementations a bit differently from the old stuff.  The original
user of this code was Westmere-EX, which used it as a workaround for a missing address in
MCi_ADDR for corrected errors.  So in that scenario we had every error being reported and
mcelog(8) deamon doing the threshold analysis to decide when to take action.

In this new modern world - Naveen wants to have the BIOS decide the threshold, so we'd
like Linux to take some action as soon as it sees just one CPER.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 19:05         ` Luck, Tony
@ 2013-06-19 20:14           ` Borislav Petkov
  2013-06-19 20:33             ` Luck, Tony
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-19 20:14 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying

On Wed, Jun 19, 2013 at 07:05:16PM +0000, Luck, Tony wrote:
> > Why, fill out struct mce and do mce_log(mce) does not suffice?
> 
> There is (or should be)

Ha!

> a lot more interesting stuff in the CPER than just the address. Stuff
> that we don't have fields for in the existing mcelog structure. We
> also need to treat filtered records from modern APEI implementations a
> bit differently from the old stuff.

Great, the CPER record is described in the UEFI spec. Those BIOS people
are all like a mafia.

Ok, seriously: so the situation should still be fine, FF reported errors
get the CPER format while the rest, the "old" MCE format.

cper.c is doing printk so I'm guessing it would need to get its own
tracepoint and carry that to userspace.

Concerning the RAS daemon, Robert and I are making good progress so once
we have the persistent events in perf, we can read that tracepoint in
userspace and do whatever we want with the error info.

> The original user of this code was Westmere-EX, which used it as a
> workaround for a missing address in MCi_ADDR for corrected errors.
> So in that scenario we had every error being reported and mcelog(8)
> deamon doing the threshold analysis to decide when to take action.
>
> In this new modern world - Naveen wants to have the BIOS decide the
> threshold, so we'd like Linux to take some action as soon as it sees
> just one CPER.

Why would Linux have to intervene if it is doing FF - wasn't the deal
behind Firmware First for the firmware to get the error first and handle
accordingly?

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 20:14           ` Borislav Petkov
@ 2013-06-19 20:33             ` Luck, Tony
  2013-06-19 21:07               ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-19 20:33 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 2030 bytes --]

>> There is (or should be)
>
> Ha!

Oh ye of little faith - I'm sure the BIOS will get this right this time :-)


> Ok, seriously: so the situation should still be fine, FF reported errors
> get the CPER format while the rest, the "old" MCE format.
>
> cper.c is doing printk so I'm guessing it would need to get its own
> tracepoint and carry that to userspace.

Yes - a tracepoint is the right answer here for all the new stuff.

> Concerning the RAS daemon, Robert and I are making good progress so once
> we have the persistent events in perf, we can read that tracepoint in
> userspace and do whatever we want with the error info.

Mauro has a rasdaemon in progress
       git://git.fedorahosted.org/rasdaemon.git
just picks up perf/events and logs to a sqlite database.

>> In this new modern world - Naveen wants to have the BIOS decide the
>> threshold, so we'd like Linux to take some action as soon as it sees
>> just one CPER.
>
> Why would Linux have to intervene if it is doing FF - wasn't the deal
> behind Firmware First for the firmware to get the error first and handle
> accordingly?

Because Linux can do runtime things that the BIOS can't - like offline a 4K page.
Idea here is that BIOS does whatever the OEM thinks is the right level of
threshholding - not bothering the OS with petty details of random corrected
erorrs that mean nothing.  But if there is some repeated error (like a stuck bit)
then the BIOS can provide a CPER to the OS telling it that it would be a good idea
to stop using that page.

And this is where the semantics of a CPER change between the original WSM-EX
implementation ... where Linux expects to see all the errors and do its own
thresholding only taking a page offline if it sees a lot of CPER refer to the same
page; and now - where the BIOS does the counting and tells Linux just once to
take the page offline.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 20:33             ` Luck, Tony
@ 2013-06-19 21:07               ` Borislav Petkov
  2013-06-19 21:28                 ` Luck, Tony
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-19 21:07 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

On Wed, Jun 19, 2013 at 08:33:49PM +0000, Luck, Tony wrote:
> >> There is (or should be)
> >
> > Ha!
> 
> Oh ye of little faith - I'm sure the BIOS will get this right this time :-)
> 
> 
> > Ok, seriously: so the situation should still be fine, FF reported errors
> > get the CPER format while the rest, the "old" MCE format.
> >
> > cper.c is doing printk so I'm guessing it would need to get its own
> > tracepoint and carry that to userspace.
> 
> Yes - a tracepoint is the right answer here for all the new stuff.
> 
> > Concerning the RAS daemon, Robert and I are making good progress so once
> > we have the persistent events in perf, we can read that tracepoint in
> > userspace and do whatever we want with the error info.
> 
> Mauro has a rasdaemon in progress
>        git://git.fedorahosted.org/rasdaemon.git
> just picks up perf/events and logs to a sqlite database.

Actually it uses ftrace's facilities but it is a tracepoint in the end.

And I asked him nicely not to call it rasdaemon because I already have a
RAS daemon but hey, whatever. The more confusion, the better.

> Because Linux can do runtime things that the BIOS can't - like offline
> a 4K page. Idea here is that BIOS does whatever the OEM thinks is the
> right level of threshholding - not bothering the OS with petty details
> of random corrected erorrs that mean nothing. But if there is some
> repeated error (like a stuck bit) then the BIOS can provide a CPER
> to the OS telling it that it would be a good idea to stop using that
> page.

Ok, where is that semantics? What in a CPER record does say "this error
should tell you that you need to offline the containing page and I'm
telling you this exactly only once"? Error Severity 0, i.e. Recoverable?

> And this is where the semantics of a CPER change between the original
> WSM-EX implementation ... where Linux expects to see all the errors
> and do its own thresholding only taking a page offline if it sees a
> lot of CPER refer to the same page; and now - where the BIOS does the
> counting and tells Linux just once to take the page offline.

Ok, we're talking about the S in RAS now. Do we have error recovery
strategies specified anywhere? Are they per-platform or generic? Is this
CPER strategy above, for example, only valid for some platforms or for
all APEI-using hardware?

Questions over questions...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 21:07               ` Borislav Petkov
@ 2013-06-19 21:28                 ` Luck, Tony
  2013-06-19 21:41                   ` Borislav Petkov
  2013-06-20 21:21                   ` Naveen N. Rao
  0 siblings, 2 replies; 42+ messages in thread
From: Luck, Tony @ 2013-06-19 21:28 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1524 bytes --]

> Ok, where is that semantics? What in a CPER record does say "this error
> should tell you that you need to offline the containing page and I'm
> telling you this exactly only once"? Error Severity 0, i.e. Recoverable?

Naveen - this one is for you (or for your BIOS team).  Can you get us a sample
CPER that you plan to provide when the BIOS decides that its threshold has
been exceeded?  How will it be different from what old WSM-EX platforms
were sending to us?  Hopefully the answer is encoded in the CPER record
and not in some code we have to put in Linux to say "if (IBMplatform) do_thing_1(); else ... "

> Ok, we're talking about the S in RAS now. Do we have error recovery
> strategies specified anywhere? Are they per-platform or generic? Is this
> CPER strategy above, for example, only valid for some platforms or for
> all APEI-using hardware?

mcelog(8) daemon has been doing this for years ... but it used the "predictive
failure analysis" buzzwords that were popular way back then (today the
marketing people seem to prefer "self healing" ). Whatever the name, the
concept is the same ... take some set of corrected event reports and infer
from them that something worse may happen soon, and use that information
to try to avoid the (possibly) impending crash.

> Questions over questions...

Questions are good - they help fill out gaps

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 21:28                 ` Luck, Tony
@ 2013-06-19 21:41                   ` Borislav Petkov
  2013-06-19 22:08                     ` Luck, Tony
  2013-06-20 21:21                   ` Naveen N. Rao
  1 sibling, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-19 21:41 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

On Wed, Jun 19, 2013 at 09:28:50PM +0000, Luck, Tony wrote:
> > Ok, where is that semantics? What in a CPER record does say "this error
> > should tell you that you need to offline the containing page and I'm
> > telling you this exactly only once"? Error Severity 0, i.e. Recoverable?
> 
> Naveen - this one is for you (or for your BIOS team).  Can you get us a sample
> CPER that you plan to provide when the BIOS decides that its threshold has
> been exceeded?  How will it be different from what old WSM-EX platforms
> were sending to us?  Hopefully the answer is encoded in the CPER record
> and not in some code we have to put in Linux to say "if (IBMplatform) do_thing_1(); else ... "

If we're going to be vendor-specific (which I'm pretty sure we will),
we'd gonna have to export knobs to userspace which each vendor can tweak
for themselves. Otherwise we'd get the DMI quirks ugliness all over
again and we better nip it in the bud, while we can.

> > Ok, we're talking about the S in RAS now. Do we have error recovery
> > strategies specified anywhere? Are they per-platform or generic? Is this
> > CPER strategy above, for example, only valid for some platforms or for
> > all APEI-using hardware?
> 
> mcelog(8) daemon has been doing this for years ... but it used the "predictive
> failure analysis" buzzwords that were popular way back then (today the
> marketing people seem to prefer "self healing" ). Whatever the name, the
> concept is the same ... take some set of corrected event reports and infer
> from them that something worse may happen soon, and use that information
> to try to avoid the (possibly) impending crash.

Ok, so some sort of userspace is enforcing policy based on collected
data/heuristics.

The above question about what to do *without* going to userspace and
back is maybe more interesting and we'd need a clean design there...
we'll see.

> > Questions over questions...
> 
> Questions are good - they help fill out gaps

I know - that's why I'm trying to poke holes early...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 21:41                   ` Borislav Petkov
@ 2013-06-19 22:08                     ` Luck, Tony
  2013-06-20  5:35                       ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-19 22:08 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1017 bytes --]

> The above question about what to do *without* going to userspace and
> back is maybe more interesting and we'd need a clean design there...
> we'll see.

Yes - this case (where the BIOS did all the threshold math and made the decision)
should be one where Linux kernel could just implement the action directly.
Perhaps controlled by a knob to say whether we really trust the BIOS that much.

But we will also have cases where a smart user agent can correlate data
from multiple sources to identify the real root cause (e.g. some temperature
anomalies around the same time as some memory errors that occur at 10am
on the third Tuesday each month -> cause is air conditioner maintenance guy
that shuts down the a/c for 10 minutes to change the filter).

I'll leave writing an agent that smart as an exercise for the concerned data
center manager :-)

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 22:08                     ` Luck, Tony
@ 2013-06-20  5:35                       ` Borislav Petkov
  0 siblings, 0 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-20  5:35 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

On Wed, Jun 19, 2013 at 10:08:53PM +0000, Luck, Tony wrote:
> > The above question about what to do *without* going to userspace and
> > back is maybe more interesting and we'd need a clean design there...
> > we'll see.
> 
> Yes - this case (where the BIOS did all the threshold math and made the decision)
> should be one where Linux kernel could just implement the action directly.
> Perhaps controlled by a knob to say whether we really trust the BIOS that much.
> 
> But we will also have cases where a smart user agent can correlate data
> from multiple sources to identify the real root cause (e.g. some temperature
> anomalies around the same time as some memory errors that occur at 10am
> on the third Tuesday each month -> cause is air conditioner maintenance guy
> that shuts down the a/c for 10 minutes to change the filter).

Surely we cannot put that in the kernel. For that we'd need userspace to
decide and only turn knobs in the kernel.

> I'll leave writing an agent that smart as an exercise for the concerned data
> center manager :-)

Me too, as long as it stays in userspace and it only turns
knobs/interfaces in the kernel.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-19 17:57 [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Naveen N. Rao
  2013-06-19 17:57 ` [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
@ 2013-06-20  7:39 ` Borislav Petkov
  2013-06-20 19:08   ` Naveen N. Rao
  1 sibling, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-20  7:39 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Wed, Jun 19, 2013 at 11:27:17PM +0530, Naveen N. Rao wrote:
> The Corrected Machine Check structure (CMC) in HEST has a flag which can be
> set by the firmware to indicate to the OS that it prefers to process the
> corrected error events first. In this scenario, the OS is expected to not
> monitor for corrected errors (through CMCI/polling). Instead, the firmware
> notifies the OS on corrected error events through GHES.
> 
> Linux already has support for GHES. This patch adds support for parsing CMC
> structure and to disable CMCI/polling if the firmware first flag is set.
> 
> Further, the list of machine check bank structures at the end of CMC is used
> to determine which MCA banks function in FF mode, so that we continue to
> monitor error events on the other banks.
> 
> 
> - Naveen
> 
> --
> Changes:
> - Incorporated comments from Boris and Tony from the previous thread at
>   http://thread.gmane.org/gmane.linux.acpi.devel/60802
> - Added patch to disable firmware first mode through a boot option.
> 
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  arch/x86/include/asm/mce.h                |    3 ++
>  arch/x86/kernel/cpu/mcheck/mce-internal.h |    3 ++
>  arch/x86/kernel/cpu/mcheck/mce.c          |   25 ++++++++++++++++++
>  arch/x86/kernel/cpu/mcheck/mce_intel.c    |   40 +++++++++++++++++++++++------
>  drivers/acpi/apei/hest.c                  |   36 ++++++++++++++++++++++++++
>  5 files changed, 99 insertions(+), 8 deletions(-)
> 
> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
> index fa5f71e..380fff8 100644
> --- a/arch/x86/include/asm/mce.h
> +++ b/arch/x86/include/asm/mce.h
> @@ -188,6 +188,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
>  				    const char __user *ubuf,
>  				    size_t usize, loff_t *off));
>  
> +/* Disable CMCI/polling for MCA bank claimed by firmware */
> +extern void mce_disable_ce_bank(int bank);
> +
>  /*
>   * Exception handler
>   */
> diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
> index 5b7d4fa..193edc1 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
> +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
> @@ -25,6 +25,9 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
>  struct dentry *mce_get_debugfs_dir(void);
>  
>  extern struct mce_bank *mce_banks;
> +extern mce_banks_t mce_banks_ce_disabled;
> +
> +void cmci_disable_bank(int bank);
>  
>  #ifdef CONFIG_X86_MCE_INTEL
>  unsigned long mce_intel_adjust_timer(unsigned long interval);
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 9239504..9512034 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -94,6 +94,9 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
>  };
>  
> +/* MCA banks controlled through firmware first for corrected errors */
> +mce_banks_t mce_banks_ce_disabled;

Why the second bitfield? The cleared bits in mce_poll_banks should be
the banks which are handled in FF mode, no?

> +
>  static DEFINE_PER_CPU(struct work_struct, mce_work);
>  
>  static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
> @@ -1932,6 +1935,28 @@ static struct miscdevice mce_chrdev_device = {
>  	&mce_chrdev_ops,
>  };
>  
> +static void __mce_disable_ce_bank(void *arg)
> +{
> +	int bank = *((int *)arg);
> +
> +	/* Ensure we don't poll this bank */

No need for that comment.

> +	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
> +	/* Disable CMCI */

No need for that comment either - function name cannot be more
descriptive :-)

> +	cmci_disable_bank(bank);
> +}
> +
> +void mce_disable_ce_bank(int bank)

Yeah, let's call it ...disable_poll_bank because we're disabling polling
for those banks. And yes, we poll for errors for which no MCE exception
is generated and those happen to be corrected but still...

> +{
> +	if (bank >= mca_cfg.banks) {
> +		pr_warning(FW_BUG 
> +			"Ignoring request to disable invalid MCA bank %d.\n",
> +			bank);
> +		return;
> +	}
> +	set_bit(bank, mce_banks_ce_disabled);
> +	on_each_cpu(__mce_disable_ce_bank, &bank, 1);
> +}
> +
>  /*
>   * mce=off Disables machine check
>   * mce=no_cmci Disables CMCI
> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> index ae1697c..78256c0 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> @@ -191,6 +191,10 @@ static void cmci_discover(int banks)
>  		if (test_bit(i, owned))
>  			continue;
>  
> +		/* Skip banks in firmware first mode */
> +		if (test_bit(i, mce_banks_ce_disabled))
> +			continue;
> +
>  		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
>  
>  		/* Already owned by someone else? */
> @@ -259,6 +263,20 @@ void cmci_recheck(void)
>  	local_irq_restore(flags);
>  }
>  
> +/* Caller must hold the lock on cmci_discover_lock */
> +static void __cmci_disable_bank(int bank)
> +{
> +	u64 val;
> +
> +	if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
> +		return;
> +	/* Disable CMCI */
> +	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	val &= ~MCI_CTL2_CMCI_EN;
> +	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
> +	__clear_bit(bank, __get_cpu_var(mce_banks_owned));
> +}
> +
>  /*
>   * Disable CMCI on this CPU for all banks it owns when it goes down.
>   * This allows other CPUs to claim the banks on rediscovery.
> @@ -268,19 +286,12 @@ void cmci_clear(void)
>  	unsigned long flags;
>  	int i;
>  	int banks;
> -	u64 val;
>  
>  	if (!cmci_supported(&banks))
>  		return;
>  	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>  	for (i = 0; i < banks; i++) {
> -		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
> -			continue;
> -		/* Disable CMCI */
> -		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
> -		val &= ~MCI_CTL2_CMCI_EN;
> -		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
> -		__clear_bit(i, __get_cpu_var(mce_banks_owned));
> +		__cmci_disable_bank(i);
>  	}

This leaves only one line so no need for the {} braces anymore.

>  	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>  }
> @@ -315,6 +326,19 @@ void cmci_reenable(void)
>  		cmci_discover(banks);
>  }
>  
> +void cmci_disable_bank(int bank)
> +{
> +	int banks;
> +	unsigned long flags;
> +
> +	if (!cmci_supported(&banks))
> +		return;
> +
> +	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
> +	__cmci_disable_bank(bank);
> +	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
> +}

You need a prototype for that if you're going to call it from mce.c but
CONFIG_X86_MCE_INTEL is not set:

arch/x86/built-in.o: In function `__mce_disable_ce_bank':
/home/boris/kernel/linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c:1945: undefined reference to `cmci_disable_bank'
make: *** [vmlinux] Error 1

> +
>  static void intel_init_cmci(void)
>  {
>  	int banks;
> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
> index f5ef5d5..d8c69ba 100644
> --- a/drivers/acpi/apei/hest.c
> +++ b/drivers/acpi/apei/hest.c
> @@ -36,6 +36,7 @@
>  #include <linux/io.h>
>  #include <linux/platform_device.h>
>  #include <acpi/apei.h>
> +#include <asm/mce.h>
>  
>  #include "apei-internal.h"
>  
> @@ -121,6 +122,39 @@ int apei_hest_parse(apei_hest_func_t func, void *data)
>  }
>  EXPORT_SYMBOL_GPL(apei_hest_parse);
>  
> +/*
> + * Check if firmware advertises firmware first mode. We need FF bit to be set
> + * along with a set of MC banks which work in FF mode.
> + */
> +static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
> +{
> +	int i;
> +	struct acpi_hest_ia_corrected *cmc;
> +	struct acpi_hest_ia_error_bank *mc_bank;
> +
> +	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
> +		return 0;
> +
> +	cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
> +	if (!cmc->enabled || !(cmc->flags & ACPI_HEST_FIRMWARE_FIRST))
> +		return 0;
> +
> +	/*
> +	 * We expect HEST to provide a list of MC banks that
> +	 * report errors through firmware first mode.

		     ... in firmware first mode.

> +	 */
> +	if (cmc->num_hardware_banks == 0)

	if (!cmc->num_hardware_banks)

> +		return 0;

Err, why does this function return 0 when the sanity checks above fail?
apei_hest_parse actually looks at the retval and advances the hest_hdr.

> +
> +	pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
> +
> +	mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
> +	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
> +		mce_disable_ce_bank(mc_bank->bank_number);
> +
> +	return 0;

IOW, we should return 0 only here.

[ … ]

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 17:57 ` [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
  2013-06-19 18:04   ` Borislav Petkov
@ 2013-06-20  7:48   ` Borislav Petkov
  2013-06-20 19:02     ` Naveen N. Rao
  1 sibling, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-20  7:48 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Wed, Jun 19, 2013 at 11:27:42PM +0530, Naveen N. Rao wrote:
> Add a boot option to disable firmware first mode for corrected errors.
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  Documentation/x86/x86_64/boot-options.txt |    5 +++++
>  arch/x86/include/asm/acpi.h               |    2 ++
>  arch/x86/kernel/acpi/boot.c               |    5 +++++
>  drivers/acpi/apei/hest.c                  |    3 ++-
>  4 files changed, 14 insertions(+), 1 deletion(-)
> 
> diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
> index e9e8ddb..1228b22 100644
> --- a/Documentation/x86/x86_64/boot-options.txt
> +++ b/Documentation/x86/x86_64/boot-options.txt
> @@ -176,6 +176,11 @@ ACPI
>  
>    acpi=noirq	Don't route interrupts
>  
> +  acpi=nocmcff	Disable firmware first mode for corrected errors. This
> +		disables parsing the HEST CMC error source to check if
> +		firmware has set the FF flag. This may result in
> +		duplicate corrected error reports.

Ok, this option should be called something more vendor-agnostic like

acpi=noff

for example.

CMCI is Intel-specific and Firmware First is a generic way to say that
firmware would like to look at the errors first.

AMD has similar thing called error thresholding so...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-20  7:48   ` Borislav Petkov
@ 2013-06-20 19:02     ` Naveen N. Rao
  0 siblings, 0 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-20 19:02 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/20/2013 01:18 PM, Borislav Petkov wrote:
> On Wed, Jun 19, 2013 at 11:27:42PM +0530, Naveen N. Rao wrote:
>> Add a boot option to disable firmware first mode for corrected errors.
>>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>>   Documentation/x86/x86_64/boot-options.txt |    5 +++++
>>   arch/x86/include/asm/acpi.h               |    2 ++
>>   arch/x86/kernel/acpi/boot.c               |    5 +++++
>>   drivers/acpi/apei/hest.c                  |    3 ++-
>>   4 files changed, 14 insertions(+), 1 deletion(-)
>>
>> diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
>> index e9e8ddb..1228b22 100644
>> --- a/Documentation/x86/x86_64/boot-options.txt
>> +++ b/Documentation/x86/x86_64/boot-options.txt
>> @@ -176,6 +176,11 @@ ACPI
>>
>>     acpi=noirq	Don't route interrupts
>>
>> +  acpi=nocmcff	Disable firmware first mode for corrected errors. This
>> +		disables parsing the HEST CMC error source to check if
>> +		firmware has set the FF flag. This may result in
>> +		duplicate corrected error reports.
>
> Ok, this option should be called something more vendor-agnostic like
>
> acpi=noff
>
> for example.
>
> CMCI is Intel-specific and Firmware First is a generic way to say that
> firmware would like to look at the errors first.
>
> AMD has similar thing called error thresholding so...
>

nocmcff actually refers to CMC (Corrected Machine Check) Error source 
which is APEI terminology, so this is not actually Intel-specific and 
should apply equally well for AMD. I chose nocmcff over noff since there 
is also a firmware first flag for MCE Error source in APEI and this 
option only disables FF for CMC.

Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20  7:39 ` [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Borislav Petkov
@ 2013-06-20 19:08   ` Naveen N. Rao
  2013-06-20 19:29     ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-20 19:08 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/20/2013 01:09 PM, Borislav Petkov wrote:
> On Wed, Jun 19, 2013 at 11:27:17PM +0530, Naveen N. Rao wrote:
>> The Corrected Machine Check structure (CMC) in HEST has a flag which can be
>> set by the firmware to indicate to the OS that it prefers to process the
>> corrected error events first. In this scenario, the OS is expected to not
>> monitor for corrected errors (through CMCI/polling). Instead, the firmware
>> notifies the OS on corrected error events through GHES.
>>
>> Linux already has support for GHES. This patch adds support for parsing CMC
>> structure and to disable CMCI/polling if the firmware first flag is set.
>>
>> Further, the list of machine check bank structures at the end of CMC is used
>> to determine which MCA banks function in FF mode, so that we continue to
>> monitor error events on the other banks.
>>
>>
>> - Naveen
>>
>> --
>> Changes:
>> - Incorporated comments from Boris and Tony from the previous thread at
>>    http://thread.gmane.org/gmane.linux.acpi.devel/60802
>> - Added patch to disable firmware first mode through a boot option.
>>
>>
>> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
>> ---
>>   arch/x86/include/asm/mce.h                |    3 ++
>>   arch/x86/kernel/cpu/mcheck/mce-internal.h |    3 ++
>>   arch/x86/kernel/cpu/mcheck/mce.c          |   25 ++++++++++++++++++
>>   arch/x86/kernel/cpu/mcheck/mce_intel.c    |   40 +++++++++++++++++++++++------
>>   drivers/acpi/apei/hest.c                  |   36 ++++++++++++++++++++++++++
>>   5 files changed, 99 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
>> index fa5f71e..380fff8 100644
>> --- a/arch/x86/include/asm/mce.h
>> +++ b/arch/x86/include/asm/mce.h
>> @@ -188,6 +188,9 @@ extern void register_mce_write_callback(ssize_t (*)(struct file *filp,
>>   				    const char __user *ubuf,
>>   				    size_t usize, loff_t *off));
>>
>> +/* Disable CMCI/polling for MCA bank claimed by firmware */
>> +extern void mce_disable_ce_bank(int bank);
>> +
>>   /*
>>    * Exception handler
>>    */
>> diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
>> index 5b7d4fa..193edc1 100644
>> --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
>> +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
>> @@ -25,6 +25,9 @@ int mce_severity(struct mce *a, int tolerant, char **msg);
>>   struct dentry *mce_get_debugfs_dir(void);
>>
>>   extern struct mce_bank *mce_banks;
>> +extern mce_banks_t mce_banks_ce_disabled;
>> +
>> +void cmci_disable_bank(int bank);
>>
>>   #ifdef CONFIG_X86_MCE_INTEL
>>   unsigned long mce_intel_adjust_timer(unsigned long interval);
>> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
>> index 9239504..9512034 100644
>> --- a/arch/x86/kernel/cpu/mcheck/mce.c
>> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
>> @@ -94,6 +94,9 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>>   	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
>>   };
>>
>> +/* MCA banks controlled through firmware first for corrected errors */
>> +mce_banks_t mce_banks_ce_disabled;
>
> Why the second bitfield? The cleared bits in mce_poll_banks should be
> the banks which are handled in FF mode, no?

We need this bitfield to prevent enabling CMCI in future cmci_discover() 
invocations. See usage in cmci_discover() further below.

>
>> +
>>   static DEFINE_PER_CPU(struct work_struct, mce_work);
>>
>>   static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
>> @@ -1932,6 +1935,28 @@ static struct miscdevice mce_chrdev_device = {
>>   	&mce_chrdev_ops,
>>   };
>>
>> +static void __mce_disable_ce_bank(void *arg)
>> +{
>> +	int bank = *((int *)arg);
>> +
>> +	/* Ensure we don't poll this bank */
>
> No need for that comment.
>
>> +	__clear_bit(bank, __get_cpu_var(mce_poll_banks));
>> +	/* Disable CMCI */
>
> No need for that comment either - function name cannot be more
> descriptive :-)

Sure, will remove these comments.

>
>> +	cmci_disable_bank(bank);
>> +}
>> +
>> +void mce_disable_ce_bank(int bank)
>
> Yeah, let's call it ...disable_poll_bank because we're disabling polling
> for those banks. And yes, we poll for errors for which no MCE exception
> is generated and those happen to be corrected but still...

We actually also disable CMCI here. So, in essence, we are disabling 
these banks for all sorts of direct corrected error reporting. I thought 
of naming this disable_ce_on_bank() or disable_ce_bank(), but felt that 
the mce_ prefix would be good to have. If that isn't necessary, I can 
rename this to disable_ce_on_bank() which sounds more accurate to me. Is 
that ok?

>
>> +{
>> +	if (bank >= mca_cfg.banks) {
>> +		pr_warning(FW_BUG
>> +			"Ignoring request to disable invalid MCA bank %d.\n",
>> +			bank);
>> +		return;
>> +	}
>> +	set_bit(bank, mce_banks_ce_disabled);
>> +	on_each_cpu(__mce_disable_ce_bank, &bank, 1);
>> +}
>> +
>>   /*
>>    * mce=off Disables machine check
>>    * mce=no_cmci Disables CMCI
>> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
>> index ae1697c..78256c0 100644
>> --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
>> +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
>> @@ -191,6 +191,10 @@ static void cmci_discover(int banks)
>>   		if (test_bit(i, owned))
>>   			continue;
>>
>> +		/* Skip banks in firmware first mode */
>> +		if (test_bit(i, mce_banks_ce_disabled))
>> +			continue;
>> +
>>   		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
>>
>>   		/* Already owned by someone else? */
>> @@ -259,6 +263,20 @@ void cmci_recheck(void)
>>   	local_irq_restore(flags);
>>   }
>>
>> +/* Caller must hold the lock on cmci_discover_lock */
>> +static void __cmci_disable_bank(int bank)
>> +{
>> +	u64 val;
>> +
>> +	if (!test_bit(bank, __get_cpu_var(mce_banks_owned)))
>> +		return;
>> +	/* Disable CMCI */
>> +	rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +	val &= ~MCI_CTL2_CMCI_EN;
>> +	wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
>> +	__clear_bit(bank, __get_cpu_var(mce_banks_owned));
>> +}
>> +
>>   /*
>>    * Disable CMCI on this CPU for all banks it owns when it goes down.
>>    * This allows other CPUs to claim the banks on rediscovery.
>> @@ -268,19 +286,12 @@ void cmci_clear(void)
>>   	unsigned long flags;
>>   	int i;
>>   	int banks;
>> -	u64 val;
>>
>>   	if (!cmci_supported(&banks))
>>   		return;
>>   	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>>   	for (i = 0; i < banks; i++) {
>> -		if (!test_bit(i, __get_cpu_var(mce_banks_owned)))
>> -			continue;
>> -		/* Disable CMCI */
>> -		rdmsrl(MSR_IA32_MCx_CTL2(i), val);
>> -		val &= ~MCI_CTL2_CMCI_EN;
>> -		wrmsrl(MSR_IA32_MCx_CTL2(i), val);
>> -		__clear_bit(i, __get_cpu_var(mce_banks_owned));
>> +		__cmci_disable_bank(i);
>>   	}
>
> This leaves only one line so no need for the {} braces anymore.

Ah, ok.

>
>>   	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>>   }
>> @@ -315,6 +326,19 @@ void cmci_reenable(void)
>>   		cmci_discover(banks);
>>   }
>>
>> +void cmci_disable_bank(int bank)
>> +{
>> +	int banks;
>> +	unsigned long flags;
>> +
>> +	if (!cmci_supported(&banks))
>> +		return;
>> +
>> +	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
>> +	__cmci_disable_bank(bank);
>> +	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>> +}
>
> You need a prototype for that if you're going to call it from mce.c but
> CONFIG_X86_MCE_INTEL is not set:
>
> arch/x86/built-in.o: In function `__mce_disable_ce_bank':
> /home/boris/kernel/linux-2.6/arch/x86/kernel/cpu/mcheck/mce.c:1945: undefined reference to `cmci_disable_bank'
> make: *** [vmlinux] Error 1

Good catch - missed this. Will address this.

>
>> +
>>   static void intel_init_cmci(void)
>>   {
>>   	int banks;
>> diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
>> index f5ef5d5..d8c69ba 100644
>> --- a/drivers/acpi/apei/hest.c
>> +++ b/drivers/acpi/apei/hest.c
>> @@ -36,6 +36,7 @@
>>   #include <linux/io.h>
>>   #include <linux/platform_device.h>
>>   #include <acpi/apei.h>
>> +#include <asm/mce.h>
>>
>>   #include "apei-internal.h"
>>
>> @@ -121,6 +122,39 @@ int apei_hest_parse(apei_hest_func_t func, void *data)
>>   }
>>   EXPORT_SYMBOL_GPL(apei_hest_parse);
>>
>> +/*
>> + * Check if firmware advertises firmware first mode. We need FF bit to be set
>> + * along with a set of MC banks which work in FF mode.
>> + */
>> +static int __init hest_parse_cmc(struct acpi_hest_header *hest_hdr, void *data)
>> +{
>> +	int i;
>> +	struct acpi_hest_ia_corrected *cmc;
>> +	struct acpi_hest_ia_error_bank *mc_bank;
>> +
>> +	if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK)
>> +		return 0;
>> +
>> +	cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
>> +	if (!cmc->enabled || !(cmc->flags & ACPI_HEST_FIRMWARE_FIRST))
>> +		return 0;
>> +
>> +	/*
>> +	 * We expect HEST to provide a list of MC banks that
>> +	 * report errors through firmware first mode.
>
> 		     ... in firmware first mode.

Ok.

>
>> +	 */
>> +	if (cmc->num_hardware_banks == 0)
>
> 	if (!cmc->num_hardware_banks)

Ok.

>
>> +		return 0;
>
> Err, why does this function return 0 when the sanity checks above fail?
> apei_hest_parse actually looks at the retval and advances the hest_hdr.
>
>> +
>> +	pr_info(HEST_PFX "Enabling Firmware First mode for corrected errors.\n");
>> +
>> +	mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1);
>> +	for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++)
>> +		mce_disable_ce_bank(mc_bank->bank_number);
>> +
>> +	return 0;
>
> IOW, we should return 0 only here.

Quite the contrary actually. We want apei_hest_parse() to continue to 
the next error source structure _until_ we find CMC error source. This 
requires us to return 0 at the very first check we have for error source 
type. After that, we should actually return a non-zero value so we don't 
continue looking at further entries (as per APEI, only one error source 
of type CMC is permitted in HEST.) I will make this change.

As always, thanks for the detailed review!
- Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20 19:08   ` Naveen N. Rao
@ 2013-06-20 19:29     ` Borislav Petkov
  2013-06-20 20:14       ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-20 19:29 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Fri, Jun 21, 2013 at 12:38:13AM +0530, Naveen N. Rao wrote:
> We need this bitfield to prevent enabling CMCI in future
> cmci_discover() invocations. See usage in cmci_discover() further
> below.

So?!

	/* Skip banks in firmware first mode */
	if (!test_bit(i, __get_cpu_var(mce_poll_banks))
		continue;

...

> >Yeah, let's call it ...disable_poll_bank because we're disabling polling
> >for those banks. And yes, we poll for errors for which no MCE exception
> >is generated and those happen to be corrected but still...
> 
> We actually also disable CMCI here. So, in essence, we are disabling
> these banks for all sorts of direct corrected error reporting. I
> thought of naming this disable_ce_on_bank() or disable_ce_bank(),
> but felt that the mce_ prefix would be good to have. If that isn't
> necessary, I can rename this to disable_ce_on_bank() which sounds
> more accurate to me. Is that ok?

No, mce_disable_bank() removes the respective bank from the polling
bitfield and cmci_disable_bank() actually disables CMCI which is
Intel-only. So leave it at mce_disable_bank and that should be fine.

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20 19:29     ` Borislav Petkov
@ 2013-06-20 20:14       ` Naveen N. Rao
  2013-06-20 20:57         ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-20 20:14 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/21/2013 12:59 AM, Borislav Petkov wrote:
> On Fri, Jun 21, 2013 at 12:38:13AM +0530, Naveen N. Rao wrote:
>> We need this bitfield to prevent enabling CMCI in future
>> cmci_discover() invocations. See usage in cmci_discover() further
>> below.
>
> So?!
>
> 	/* Skip banks in firmware first mode */
> 	if (!test_bit(i, __get_cpu_var(mce_poll_banks))
> 		continue;

This won't work across cpu offline/online, right? We will end up _not_ 
enabling CMCI on certain banks where we should have.

>
> ...
>
>>> Yeah, let's call it ...disable_poll_bank because we're disabling polling
>>> for those banks. And yes, we poll for errors for which no MCE exception
>>> is generated and those happen to be corrected but still...
>>
>> We actually also disable CMCI here. So, in essence, we are disabling
>> these banks for all sorts of direct corrected error reporting. I
>> thought of naming this disable_ce_on_bank() or disable_ce_bank(),
>> but felt that the mce_ prefix would be good to have. If that isn't
>> necessary, I can rename this to disable_ce_on_bank() which sounds
>> more accurate to me. Is that ok?
>
> No, mce_disable_bank() removes the respective bank from the polling
> bitfield and cmci_disable_bank() actually disables CMCI which is
> Intel-only. So leave it at mce_disable_bank and that should be fine.

Ok. I will rename this to mce_disable_bank() from mce_disable_ce_bank().

Another thing: for hest_parse_cmc(), does the below look good?

         cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
         if (!cmc->enabled)
                 return 0;

#define ACPI_HEST_PARSING_DONE 1
         /*
          * We expect HEST to provide a list of MC banks that
          * report errors in firmware first mode.
          */
         if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || 
!cmc->num_hardware_banks)
                 return ACPI_HEST_PARSING_DONE;

The return value doesn't really matter since we don't check it, but 
returning an error looked like the wrong thing to do as well.


Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20 20:14       ` Naveen N. Rao
@ 2013-06-20 20:57         ` Borislav Petkov
  2013-06-20 21:22           ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-20 20:57 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Fri, Jun 21, 2013 at 01:44:00AM +0530, Naveen N. Rao wrote:
> This won't work across cpu offline/online, right? We will end up
> _not_ enabling CMCI on certain banks where we should have.

Huh, don't understand. cmci_discover runs on each CPU. After you've run
hest_parse_cmc early during boot and cleared the mce_poll_banks bits,
nothing will set them again so CPU hotplug doesn't matter...

> Another thing: for hest_parse_cmc(), does the below look good?
> 
>         cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
>         if (!cmc->enabled)
>                 return 0;
> 
> #define ACPI_HEST_PARSING_DONE 1
>         /*
>          * We expect HEST to provide a list of MC banks that
>          * report errors in firmware first mode.
>          */
>         if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
> !cmc->num_hardware_banks)
>                 return ACPI_HEST_PARSING_DONE;
> 
> The return value doesn't really matter since we don't check it, but
> returning an error looked like the wrong thing to do as well.

I'd add a comment above the "return 1" statement to explain why I'm
doing this. It is much more verbose even than a well-named macro :)

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-19 21:28                 ` Luck, Tony
  2013-06-19 21:41                   ` Borislav Petkov
@ 2013-06-20 21:21                   ` Naveen N. Rao
  2013-06-20 22:11                     ` Luck, Tony
  1 sibling, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-20 21:21 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

On 06/20/2013 02:58 AM, Luck, Tony wrote:
>> Ok, where is that semantics? What in a CPER record does say "this error
>> should tell you that you need to offline the containing page and I'm
>> telling you this exactly only once"? Error Severity 0, i.e. Recoverable?
>
> Naveen - this one is for you (or for your BIOS team).  Can you get us a sample
> CPER that you plan to provide when the BIOS decides that its threshold has
> been exceeded?  How will it be different from what old WSM-EX platforms
> were sending to us?  Hopefully the answer is encoded in the CPER record
> and not in some code we have to put in Linux to say "if (IBMplatform) do_thing_1(); else ... "

Looking at the specs, there might be a few ways we can do this:
- One, Error threshold value of 1 in the Hardware Error Notification 
structure of CMC. This field is described as the number of error events 
before OS considers this as an error event. With a threshold value of 1, 
we are essentially asking the OS not to threshold further.
- Two, the Generic Error Data Entry (aka UEFI Section Descriptor) has a 
flag which indicates 'Error Threshold Exceeded'. From the UEFI spec, it 
looks like we could consider this as an indication to offline the page; 
though I am not sure if/how this relates to the threshold value above.

Thoughts?


Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20 20:57         ` Borislav Petkov
@ 2013-06-20 21:22           ` Naveen N. Rao
  2013-06-21  7:34             ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-20 21:22 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/21/2013 02:27 AM, Borislav Petkov wrote:
> On Fri, Jun 21, 2013 at 01:44:00AM +0530, Naveen N. Rao wrote:
>> This won't work across cpu offline/online, right? We will end up
>> _not_ enabling CMCI on certain banks where we should have.
>
> Huh, don't understand. cmci_discover runs on each CPU. After you've run
> hest_parse_cmc early during boot and cleared the mce_poll_banks bits,
> nothing will set them again so CPU hotplug doesn't matter...

Exactly, but mce_poll_banks also doesn't have bits set for banks on 
which CMCI is enabled.

Let's say we have a cpu with 2 banks (not shared), none of which work in 
FF mode. Both these banks support CMCI, so mce_poll_banks won't have 
these bits set.

On cpu offline, we call cmci_clear() which disables CMCI on these two 
banks before offlining it.  When this cpu is brought online again, we 
call cmci_discover() which sees that mce_poll_banks doesn't have these 
two banks enabled and will skip enabling CMCI thinking these are in FF.

Right?

>
>> Another thing: for hest_parse_cmc(), does the below look good?
>>
>>          cmc = (struct acpi_hest_ia_corrected *)hest_hdr;
>>          if (!cmc->enabled)
>>                  return 0;
>>
>> #define ACPI_HEST_PARSING_DONE 1
>>          /*
>>           * We expect HEST to provide a list of MC banks that
>>           * report errors in firmware first mode.
>>           */
>>          if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) ||
>> !cmc->num_hardware_banks)
>>                  return ACPI_HEST_PARSING_DONE;
>>
>> The return value doesn't really matter since we don't check it, but
>> returning an error looked like the wrong thing to do as well.
>
> I'd add a comment above the "return 1" statement to explain why I'm
> doing this. It is much more verbose even than a well-named macro :)

Okay :)

Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-20 21:21                   ` Naveen N. Rao
@ 2013-06-20 22:11                     ` Luck, Tony
  2013-06-21  7:27                       ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-20 22:11 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: Borislav Petkov, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 653 bytes --]

> - Two, the Generic Error Data Entry (aka UEFI Section Descriptor) has a 
> flag which indicates 'Error Threshold Exceeded'. From the UEFI spec, it 
> looks like we could consider this as an indication to offline the page; 
> though I am not sure if/how this relates to the threshold value above.

This one sounds to make sense ... the flag description sounds exactly what
we want - I won't feel embarrassed explaining to people why Linux takes
action when it sees a record like this.

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-20 22:11                     ` Luck, Tony
@ 2013-06-21  7:27                       ` Borislav Petkov
  2013-06-21 16:43                         ` Naveen N. Rao
  2013-06-28 12:04                         ` Naveen N. Rao
  0 siblings, 2 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-21  7:27 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Naveen N. Rao, ananth, masbock, lcm, linux-kernel, linux-acpi,
	Huang, Ying, Robert Richter

On Thu, Jun 20, 2013 at 10:11:27PM +0000, Luck, Tony wrote:
> > - Two, the Generic Error Data Entry (aka UEFI Section Descriptor) has a 
> > flag which indicates 'Error Threshold Exceeded'. From the UEFI spec, it 
> > looks like we could consider this as an indication to offline the page; 
> > though I am not sure if/how this relates to the threshold value above.
> 
> This one sounds to make sense ... the flag description sounds exactly what
> we want - I won't feel embarrassed explaining to people why Linux takes
> action when it sees a record like this.

Yep, and firmware can set that flag when it wants to, so decision when
to offline a page is left to the platform.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-20 21:22           ` Naveen N. Rao
@ 2013-06-21  7:34             ` Borislav Petkov
  2013-06-21  7:46               ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-21  7:34 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Fri, Jun 21, 2013 at 02:52:25AM +0530, Naveen N. Rao wrote:
> Exactly, but mce_poll_banks also doesn't have bits set for banks on
> which CMCI is enabled.
>
> Let's say we have a cpu with 2 banks (not shared), none of which work
> in FF mode. Both these banks support CMCI, so mce_poll_banks won't
> have these bits set.
>
> On cpu offline, we call cmci_clear() which disables CMCI on these two
> banks before offlining it. When this cpu is brought online again, we
> call cmci_discover() which sees that mce_poll_banks doesn't have these
> two banks enabled and will skip enabling CMCI thinking these are in
> FF.

Hmm, mce_intel has yet another bitfield - mce_banks_owned. (Btw, this is
why I have a problem with adding yet another bitfield).

The way I understand it is, if a bit is set in the owned bitfield, those
banks belong to CMCI and are not polled.

Now, can we use both mce_banks_owned and mce_poll_banks? If a bit in
both bifields is cleared, the corresponding bank is not polled *and* is
not owned by CMCI => it is in FF mode.

Makes sense?

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21  7:34             ` Borislav Petkov
@ 2013-06-21  7:46               ` Naveen N. Rao
  2013-06-21  8:36                 ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-21  7:46 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/21/2013 01:04 PM, Borislav Petkov wrote:
> On Fri, Jun 21, 2013 at 02:52:25AM +0530, Naveen N. Rao wrote:
>> Exactly, but mce_poll_banks also doesn't have bits set for banks on
>> which CMCI is enabled.
>>
>> Let's say we have a cpu with 2 banks (not shared), none of which work
>> in FF mode. Both these banks support CMCI, so mce_poll_banks won't
>> have these bits set.
>>
>> On cpu offline, we call cmci_clear() which disables CMCI on these two
>> banks before offlining it. When this cpu is brought online again, we
>> call cmci_discover() which sees that mce_poll_banks doesn't have these
>> two banks enabled and will skip enabling CMCI thinking these are in
>> FF.
>
> Hmm, mce_intel has yet another bitfield - mce_banks_owned. (Btw, this is
> why I have a problem with adding yet another bitfield).
>
> The way I understand it is, if a bit is set in the owned bitfield, those
> banks belong to CMCI and are not polled.
>
> Now, can we use both mce_banks_owned and mce_poll_banks? If a bit in
> both bifields is cleared, the corresponding bank is not polled *and* is
> not owned by CMCI => it is in FF mode.
>
> Makes sense?
>

Yes, but I'm afraid this won't work either - mce_banks_owned is cleared 
during cpu offline. This is necessary since a cmci rediscover is 
triggered on cpu offline, so that if this bank is shared across cores, a 
different cpu can claim ownership of this bank.

The difference between the new bitfield and the existing bitfields is 
that the new one is not per-cpu. This is a global list of banks across 
cpus that we do not want enabled.


Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21  7:46               ` Naveen N. Rao
@ 2013-06-21  8:36                 ` Borislav Petkov
  2013-06-21  9:32                   ` Naveen N. Rao
  2013-06-21 16:47                   ` Tony Luck
  0 siblings, 2 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-21  8:36 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Fri, Jun 21, 2013 at 01:16:50PM +0530, Naveen N. Rao wrote:
> Yes, but I'm afraid this won't work either - mce_banks_owned is
> cleared during cpu offline. This is necessary since a cmci
> rediscover is triggered on cpu offline, so that if this bank is
> shared across cores, a different cpu can claim ownership of this
> bank.

What for? Sounds strange to me.

Anyway, the reason for which mce_banks_owned won't work is because it
is in mce_intel and we need a mce.c global bitfield for banks which are
excluded from CE handling.

So ok, I'm persuaded, yet another bitfield it is ... :-\

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21  8:36                 ` Borislav Petkov
@ 2013-06-21  9:32                   ` Naveen N. Rao
  2013-06-21 14:08                     ` Borislav Petkov
  2013-06-21 16:47                   ` Tony Luck
  1 sibling, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-21  9:32 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On 06/21/2013 02:06 PM, Borislav Petkov wrote:
> On Fri, Jun 21, 2013 at 01:16:50PM +0530, Naveen N. Rao wrote:
>> Yes, but I'm afraid this won't work either - mce_banks_owned is
>> cleared during cpu offline. This is necessary since a cmci
>> rediscover is triggered on cpu offline, so that if this bank is
>> shared across cores, a different cpu can claim ownership of this
>> bank.
>
> What for? Sounds strange to me.

Look at section "15.5.1 CMCI Local APIC Interface" from Intel SDM Vol. 
3, and the subsequent section on "System Software Recommendation for 
Managing CMCI and Machine Check Resources":
"For example, if a corrected bit error in a cache shared by two logical 
processors caused a CMCI, the interrupt will be delivered to both 
logical processors sharing that microarchitectural sub-system."

In other words, some of the MC banks are shared across logical cpus in a 
core and some across all cores in a package. During initialization, the 
first cpu in a core ends up owning most of the banks specific to the 
core/package. When this cpu is offlined, we would want the second cpu in 
that core to discover and enable CMCI for those MC banks which it shares 
with the first cpu.

As an example, consider a hypothetical single-core Intel processor with 
Hyperthreading. On init, let's say the first cpu ends up owning banks 1, 
2, 3 and 4; and the second cpu ends up owning banks 1 and 2. This would 
mean that MC banks 1 and 2 are "hyperthread"-specific, while banks 3 and 
4 are shared. Now, if we offline the first cpu, it disables CMCI on all 
4 banks. However, banks 3 and 4 are shared. So, if we now do a cmci 
rediscovery, the second cpu will see that banks 3 and 4 don't have CMCI 
enabled and will then claim ownership of those so that we can continue 
to receive and process CMCIs from those subsystems.

Makes sense now?


Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21  9:32                   ` Naveen N. Rao
@ 2013-06-21 14:08                     ` Borislav Petkov
  0 siblings, 0 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-21 14:08 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Fri, Jun 21, 2013 at 03:02:04PM +0530, Naveen N. Rao wrote:
> As an example, consider a hypothetical single-core Intel processor
> with Hyperthreading. On init, let's say the first cpu ends up owning
> banks 1, 2, 3 and 4; and the second cpu ends up owning banks 1 and
> 2. This would mean that MC banks 1 and 2 are "hyperthread"-specific,
> while banks 3 and 4 are shared. Now, if we offline the first cpu, it
> disables CMCI on all 4 banks.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

This is what I find strange - having to disable CMCI, especially on a
shared bank, just to reenable it right back on the next core. But I
guess this is a constraint dictated by the hardware...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-21  7:27                       ` Borislav Petkov
@ 2013-06-21 16:43                         ` Naveen N. Rao
  2013-06-28 12:04                         ` Naveen N. Rao
  1 sibling, 0 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-21 16:43 UTC (permalink / raw)
  To: Borislav Petkov, Luck, Tony
  Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, Huang, Ying,
	Robert Richter

On 06/21/2013 12:57 PM, Borislav Petkov wrote:
> On Thu, Jun 20, 2013 at 10:11:27PM +0000, Luck, Tony wrote:
>>> - Two, the Generic Error Data Entry (aka UEFI Section Descriptor) has a
>>> flag which indicates 'Error Threshold Exceeded'. From the UEFI spec, it
>>> looks like we could consider this as an indication to offline the page;
>>> though I am not sure if/how this relates to the threshold value above.
>>
>> This one sounds to make sense ... the flag description sounds exactly what
>> we want - I won't feel embarrassed explaining to people why Linux takes
>> action when it sees a record like this.
>
> Yep, and firmware can set that flag when it wants to, so decision when
> to offline a page is left to the platform.
>

Ok, I will work towards a patch which does this.

Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21  8:36                 ` Borislav Petkov
  2013-06-21  9:32                   ` Naveen N. Rao
@ 2013-06-21 16:47                   ` Tony Luck
  2013-06-21 17:40                     ` Borislav Petkov
  1 sibling, 1 reply; 42+ messages in thread
From: Tony Luck @ 2013-06-21 16:47 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Naveen N. Rao, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

On Fri, Jun 21, 2013 at 1:36 AM, Borislav Petkov <bp@alien8.de> wrote:
> So ok, I'm persuaded, yet another bitfield it is ... :-\

Let's add some more comments on what each of these bitfields mean. Otherwise
we will be going back over this next time we have a patch that touches one
of them and we've all forgotten the subtle details explained in this
e-mail thread.

-Tony

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21 16:47                   ` Tony Luck
@ 2013-06-21 17:40                     ` Borislav Petkov
  2013-06-25 17:46                       ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-06-21 17:40 UTC (permalink / raw)
  To: Tony Luck
  Cc: Naveen N. Rao, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

On Fri, Jun 21, 2013 at 09:47:31AM -0700, Tony Luck wrote:
> On Fri, Jun 21, 2013 at 1:36 AM, Borislav Petkov <bp@alien8.de> wrote:
> > So ok, I'm persuaded, yet another bitfield it is ... :-\
> 
> Let's add some more comments on what each of these bitfields mean. Otherwise
> we will be going back over this next time we have a patch that touches one
> of them and we've all forgotten the subtle details explained in this
> e-mail thread.

Very good point - it's like you're reading my mind! I thought about
suggesting exactly that but then it slipped my mind with all the details
and commotion here.

:-)

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-21 17:40                     ` Borislav Petkov
@ 2013-06-25 17:46                       ` Naveen N. Rao
  2013-06-25 17:53                         ` Borislav Petkov
  2013-06-25 17:55                         ` Luck, Tony
  0 siblings, 2 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-25 17:46 UTC (permalink / raw)
  To: tony.luck, bp; +Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

Tony, Boris,
Can you please see if the comments in the below patch include the details you
were expecting?

Thanks,
Naveen

--
Add comments to clarify usage of the various bitfields in the MCA subsystem

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 arch/x86/kernel/cpu/mcheck/mce.c       |    5 ++++-
 arch/x86/kernel/cpu/mcheck/mce_intel.c |    9 +++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9239504..bf49cdb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -89,7 +89,10 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
-/* MCA banks polled by the period polling timer for corrected events */
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index ae1697c..2816f53 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -24,6 +24,15 @@
  * Also supports reliable discovery of shared banks.
  */
 
+/*
+ * Indicates MCA banks controlled by the current cpu for CMCI. Note that this
+ * can change when a cpu is offlined or brought online since some MCA banks
+ * are shared across cpus. When a cpu is offlined, cmci_clear() disables CMCI
+ * on all banks owned by the cpu and clears this bitfield. At this point,
+ * cmci_rediscover() kicks in and a different cpu may end up taking
+ * ownership of some of the shared MCA banks that were previously owned
+ * by the offlined cpu.
+ */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
 /*


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-25 17:46                       ` Naveen N. Rao
@ 2013-06-25 17:53                         ` Borislav Petkov
  2013-06-25 17:55                         ` Luck, Tony
  1 sibling, 0 replies; 42+ messages in thread
From: Borislav Petkov @ 2013-06-25 17:53 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: tony.luck, ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

On Tue, Jun 25, 2013 at 11:16:32PM +0530, Naveen N. Rao wrote:
> Tony, Boris,
> Can you please see if the comments in the below patch include the details you
> were expecting?
> 
> Thanks,
> Naveen
> 
> --
> Add comments to clarify usage of the various bitfields in the MCA subsystem
> 
> Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
> ---
>  arch/x86/kernel/cpu/mcheck/mce.c       |    5 ++++-
>  arch/x86/kernel/cpu/mcheck/mce_intel.c |    9 +++++++++
>  2 files changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
> index 9239504..bf49cdb 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce.c
> @@ -89,7 +89,10 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
>  static DEFINE_PER_CPU(struct mce, mces_seen);
>  static int			cpu_missing;
>  
> -/* MCA banks polled by the period polling timer for corrected events */
> +/*
> + * MCA banks polled by the period polling timer for corrected events.
> + * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
> + */
>  DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
>  	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
>  };
> diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> index ae1697c..2816f53 100644
> --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
> +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
> @@ -24,6 +24,15 @@
>   * Also supports reliable discovery of shared banks.
>   */
>  
> +/*
> + * Indicates MCA banks controlled by the current cpu for CMCI. Note that this
> + * can change when a cpu is offlined or brought online since some MCA banks
> + * are shared across cpus. When a cpu is offlined, cmci_clear() disables CMCI
> + * on all banks owned by the cpu and clears this bitfield. At this point,
> + * cmci_rediscover() kicks in and a different cpu may end up taking
> + * ownership of some of the shared MCA banks that were previously owned
> + * by the offlined cpu.
> + */
>  static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);

Yep, it looks ok to me.

Acked-by: Borislav Petkov <bp@suse.de>

Thanks.

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-25 17:46                       ` Naveen N. Rao
  2013-06-25 17:53                         ` Borislav Petkov
@ 2013-06-25 17:55                         ` Luck, Tony
  2013-06-25 18:28                           ` Naveen N. Rao
  1 sibling, 1 reply; 42+ messages in thread
From: Luck, Tony @ 2013-06-25 17:55 UTC (permalink / raw)
  To: Naveen N. Rao, bp
  Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, Huang, Ying

[-- Warning: decoded text below may be mangled, UTF-8 assumed --]
[-- Attachment #1: Type: text/plain; charset="utf-8", Size: 1069 bytes --]

+/*
+ * Indicates MCA banks controlled by the current cpu for CMCI. Note that this
+ * can change when a cpu is offlined or brought online since some MCA banks
+ * are shared across cpus. When a cpu is offlined, cmci_clear() disables CMCI
+ * on all banks owned by the cpu and clears this bitfield. At this point,
+ * cmci_rediscover() kicks in and a different cpu may end up taking
+ * ownership of some of the shared MCA banks that were previously owned
+ * by the offlined cpu.
+ */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);

Maybe an extra sentence or two at the beginning to say *why* we need this.
E.g.

/*
 * CMCI can be delivered to multiple cpus that share a machine check bank
 * so we need to designate a single cpu to process errors logged in each bank
 * in the interrupt handler (otherwise we would have many races and potential
 * double reporting of the same error.
 */
...

-Tony
ÿôèº{.nÇ+‰·Ÿ®‰­†+%ŠËÿ±éݶ\x17¥Šwÿº{.nÇ+‰·¥Š{±þG«éÿŠ{ayº\x1dʇڙë,j\a­¢f£¢·hšïêÿ‘êçz_è®\x03(­éšŽŠÝ¢j"ú\x1a¶^[m§ÿÿ¾\a«þG«éÿ¢¸?™¨è­Ú&£ø§~á¶iO•æ¬z·švØ^\x14\x04\x1a¶^[m§ÿÿÃ\fÿ¶ìÿ¢¸?–I¥

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC
  2013-06-25 17:55                         ` Luck, Tony
@ 2013-06-25 18:28                           ` Naveen N. Rao
  0 siblings, 0 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-25 18:28 UTC (permalink / raw)
  To: tony.luck; +Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, bp, ying.huang

Tony,
Thanks - I have included your text in the patch. I wasn't sure if I should add
your Signed-off-by. Kindly review and do the needful.

Thanks,
Naveen

--
Add comments to clarify usage of the various bitfields in the MCA subsystem

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
Acked-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c       |    5 ++++-
 arch/x86/kernel/cpu/mcheck/mce_intel.c |   12 ++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 9239504..bf49cdb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -89,7 +89,10 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
-/* MCA banks polled by the period polling timer for corrected events */
+/*
+ * MCA banks polled by the period polling timer for corrected events.
+ * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
+ */
 DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 	[0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
 };
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index ae1697c..d5640530 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -24,6 +24,18 @@
  * Also supports reliable discovery of shared banks.
  */
 
+/*
+ * CMCI can be delivered to multiple cpus that share a machine check bank
+ * so we need to designate a single cpu to process errors logged in each bank
+ * in the interrupt handler (otherwise we would have many races and potential
+ * double reporting of the same error).
+ * Note that this can change when a cpu is offlined or brought online since
+ * some MCA banks are shared across cpus. When a cpu is offlined, cmci_clear()
+ * disables CMCI on all banks owned by the cpu and clears this bitfield. At
+ * this point, cmci_rediscover() kicks in and a different cpu may end up
+ * taking ownership of some of the shared MCA banks that were previously
+ * owned by the offlined cpu.
+ */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
 /*


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-21  7:27                       ` Borislav Petkov
  2013-06-21 16:43                         ` Naveen N. Rao
@ 2013-06-28 12:04                         ` Naveen N. Rao
  2013-06-28 17:31                           ` Tony Luck
  1 sibling, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-06-28 12:04 UTC (permalink / raw)
  To: tony.luck, bp; +Cc: ananth, masbock, lcm, linux-kernel, linux-acpi, ying.huang

Hi Tony, Boris,
Here is a patch which implements this technique. Kindly take a look and let me
know what you think.

Thanks,
Naveen

--
If the firmware indicates in GHES error data entry that the error threshold
has exceeded for a corrected error event, then we try to soft-offline the
page.

Signed-off-by: Naveen N. Rao <naveen.n.rao@linux.vnet.ibm.com>
---
 drivers/acpi/apei/ghes.c |    7 ++++++
 include/linux/mm.h       |    1 +
 mm/memory-failure.c      |   53 ++++++++++++++++++++++++++++++----------------
 3 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fcd7d91..17137cf 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -422,6 +422,13 @@ static void ghes_do_proc(struct ghes *ghes,
 				 CPER_SEC_PLATFORM_MEM)) {
 			struct cper_sec_mem_err *mem_err;
 			mem_err = (struct cper_sec_mem_err *)(gdata+1);
+			if (sec_sev == GHES_SEV_CORRECTED &&
+			    (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
+			    (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
+				unsigned long pfn;
+				pfn = mem_err->physical_addr >> PAGE_SHIFT;
+				soft_memory_failure_queue(pfn, 0, 0);
+			}
 			ghes_edac_report_mem_error(ghes, sev, mem_err);
 
 #ifdef CONFIG_X86_MCE
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e0c8528..f9907d2 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1787,6 +1787,7 @@ enum mf_flags {
 };
 extern int memory_failure(unsigned long pfn, int trapno, int flags);
 extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
+extern void soft_memory_failure_queue(unsigned long pfn, int trapno, int flags);
 extern int unpoison_memory(unsigned long pfn);
 extern int sysctl_memory_failure_early_kill;
 extern int sysctl_memory_failure_recovery;
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ceb0c7f..50caefd 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1222,6 +1222,7 @@ struct memory_failure_entry {
 	unsigned long pfn;
 	int trapno;
 	int flags;
+	bool soft_offline;
 };
 
 struct memory_failure_cpu {
@@ -1233,6 +1234,28 @@ struct memory_failure_cpu {
 
 static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
 
+void __memory_failure_queue(unsigned long pfn, int trapno, int flags, int soft_offline)
+{
+	struct memory_failure_cpu *mf_cpu;
+	unsigned long proc_flags;
+	struct memory_failure_entry entry = {
+		.pfn =		pfn,
+		.trapno =	trapno,
+		.flags =	flags,
+		.soft_offline = soft_offline
+	};
+
+	mf_cpu = &get_cpu_var(memory_failure_cpu);
+	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
+	if (kfifo_put(&mf_cpu->fifo, &entry))
+		schedule_work_on(smp_processor_id(), &mf_cpu->work);
+	else
+		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+		       pfn);
+	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
+	put_cpu_var(memory_failure_cpu);
+}
+
 /**
  * memory_failure_queue - Schedule handling memory failure of a page.
  * @pfn: Page Number of the corrupted page
@@ -1250,28 +1273,19 @@ static DEFINE_PER_CPU(struct memory_failure_cpu, memory_failure_cpu);
  *
  * Can run in IRQ context.
  */
+
 void memory_failure_queue(unsigned long pfn, int trapno, int flags)
 {
-	struct memory_failure_cpu *mf_cpu;
-	unsigned long proc_flags;
-	struct memory_failure_entry entry = {
-		.pfn =		pfn,
-		.trapno =	trapno,
-		.flags =	flags,
-	};
-
-	mf_cpu = &get_cpu_var(memory_failure_cpu);
-	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
-	if (kfifo_put(&mf_cpu->fifo, &entry))
-		schedule_work_on(smp_processor_id(), &mf_cpu->work);
-	else
-		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
-		       pfn);
-	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
-	put_cpu_var(memory_failure_cpu);
+	__memory_failure_queue(pfn, trapno, flags, false);
 }
 EXPORT_SYMBOL_GPL(memory_failure_queue);
 
+void soft_memory_failure_queue(unsigned long pfn, int trapno, int flags)
+{
+	__memory_failure_queue(pfn, trapno, flags, true);
+}
+EXPORT_SYMBOL_GPL(soft_memory_failure_queue);
+
 static void memory_failure_work_func(struct work_struct *work)
 {
 	struct memory_failure_cpu *mf_cpu;
@@ -1286,7 +1300,10 @@ static void memory_failure_work_func(struct work_struct *work)
 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 		if (!gotten)
 			break;
-		memory_failure(entry.pfn, entry.trapno, entry.flags);
+		if (entry.soft_offline)
+			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+		else
+			memory_failure(entry.pfn, entry.trapno, entry.flags);
 	}
 }
 


^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-28 12:04                         ` Naveen N. Rao
@ 2013-06-28 17:31                           ` Tony Luck
  2013-07-01 15:07                             ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Tony Luck @ 2013-06-28 17:31 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: Borislav Petkov, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

> +                       if (sec_sev == GHES_SEV_CORRECTED &&
> +                           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
> +                           (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
> +                               unsigned long pfn;
> +                               pfn = mem_err->physical_addr >> PAGE_SHIFT;

As Reagan said "Trust ... but verify" ... we should make sure BIOS
gave us a good pfn
                                    if (pfn_valid(pfn))
                                         soft_memory_failure_queue(pfn, 0, 0);
                                    else
                                         printk( ...something about
BIOS giving us bad pfn = %lu\n", pfn);
> +                       }

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-06-28 17:31                           ` Tony Luck
@ 2013-07-01 15:07                             ` Naveen N. Rao
  2013-07-01 15:38                               ` Borislav Petkov
  0 siblings, 1 reply; 42+ messages in thread
From: Naveen N. Rao @ 2013-07-01 15:07 UTC (permalink / raw)
  To: Tony Luck
  Cc: Borislav Petkov, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

On 06/28/2013 11:01 PM, Tony Luck wrote:
>> +                       if (sec_sev == GHES_SEV_CORRECTED &&
>> +                           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
>> +                           (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
>> +                               unsigned long pfn;
>> +                               pfn = mem_err->physical_addr >> PAGE_SHIFT;
>
> As Reagan said "Trust ... but verify" ... we should make sure BIOS
> gave us a good pfn
>                                      if (pfn_valid(pfn))
>                                           soft_memory_failure_queue(pfn, 0, 0);
>                                      else
>                                           printk( ...something about
> BIOS giving us bad pfn = %lu\n", pfn);

Ah, nice catch - I thought soft_offline_page() takes care of this, but 
it sure is good to point a finger at the firmware.

Thanks!
- Naveen

>> +                       }
>


^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-07-01 15:07                             ` Naveen N. Rao
@ 2013-07-01 15:38                               ` Borislav Petkov
  2013-07-01 15:41                                 ` Naveen N. Rao
  0 siblings, 1 reply; 42+ messages in thread
From: Borislav Petkov @ 2013-07-01 15:38 UTC (permalink / raw)
  To: Naveen N. Rao
  Cc: Tony Luck, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

On Mon, Jul 01, 2013 at 08:37:43PM +0530, Naveen N. Rao wrote:
> On 06/28/2013 11:01 PM, Tony Luck wrote:
> >>+                       if (sec_sev == GHES_SEV_CORRECTED &&
> >>+                           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
> >>+                           (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
> >>+                               unsigned long pfn;
> >>+                               pfn = mem_err->physical_addr >> PAGE_SHIFT;
> >
> >As Reagan said "Trust ... but verify" ... we should make sure BIOS
> >gave us a good pfn
> >                                     if (pfn_valid(pfn))
> >                                          soft_memory_failure_queue(pfn, 0, 0);
> >                                     else
> >                                          printk( ...something about
> >BIOS giving us bad pfn = %lu\n", pfn);
> 
> Ah, nice catch - I thought soft_offline_page() takes care of this,
> but it sure is good to point a finger at the firmware.

While at it maybe make it pr_warning(FW_BUG or FW_WARN...

-- 
Regards/Gruss,
    Boris.

Sent from a fat crate under my desk. Formatting is fine.
--

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors
  2013-07-01 15:38                               ` Borislav Petkov
@ 2013-07-01 15:41                                 ` Naveen N. Rao
  0 siblings, 0 replies; 42+ messages in thread
From: Naveen N. Rao @ 2013-07-01 15:41 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: Tony Luck, ananth, masbock, lcm, Linux Kernel Mailing List,
	linux-acpi, Huang Ying

On 07/01/2013 09:08 PM, Borislav Petkov wrote:
> On Mon, Jul 01, 2013 at 08:37:43PM +0530, Naveen N. Rao wrote:
>> On 06/28/2013 11:01 PM, Tony Luck wrote:
>>>> +                       if (sec_sev == GHES_SEV_CORRECTED &&
>>>> +                           (gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED) &&
>>>> +                           (mem_err->validation_bits & CPER_MEM_VALID_PHYSICAL_ADDRESS)) {
>>>> +                               unsigned long pfn;
>>>> +                               pfn = mem_err->physical_addr >> PAGE_SHIFT;
>>>
>>> As Reagan said "Trust ... but verify" ... we should make sure BIOS
>>> gave us a good pfn
>>>                                      if (pfn_valid(pfn))
>>>                                           soft_memory_failure_queue(pfn, 0, 0);
>>>                                      else
>>>                                           printk( ...something about
>>> BIOS giving us bad pfn = %lu\n", pfn);
>>
>> Ah, nice catch - I thought soft_offline_page() takes care of this,
>> but it sure is good to point a finger at the firmware.
>
> While at it maybe make it pr_warning(FW_BUG or FW_WARN...

Yup - I've made it a pr_warning(FW_WARN... I just sent out a new version 
with these changes.

Thanks,
Naveen


^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2013-07-01 15:41 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-06-19 17:57 [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Naveen N. Rao
2013-06-19 17:57 ` [PATCH v2 2/2] mce: acpi/apei: Add a boot option to disable ff mode for corrected errors Naveen N. Rao
2013-06-19 18:04   ` Borislav Petkov
2013-06-19 18:17     ` Naveen N. Rao
2013-06-19 18:19     ` Luck, Tony
2013-06-19 18:36       ` Borislav Petkov
2013-06-19 19:05         ` Luck, Tony
2013-06-19 20:14           ` Borislav Petkov
2013-06-19 20:33             ` Luck, Tony
2013-06-19 21:07               ` Borislav Petkov
2013-06-19 21:28                 ` Luck, Tony
2013-06-19 21:41                   ` Borislav Petkov
2013-06-19 22:08                     ` Luck, Tony
2013-06-20  5:35                       ` Borislav Petkov
2013-06-20 21:21                   ` Naveen N. Rao
2013-06-20 22:11                     ` Luck, Tony
2013-06-21  7:27                       ` Borislav Petkov
2013-06-21 16:43                         ` Naveen N. Rao
2013-06-28 12:04                         ` Naveen N. Rao
2013-06-28 17:31                           ` Tony Luck
2013-07-01 15:07                             ` Naveen N. Rao
2013-07-01 15:38                               ` Borislav Petkov
2013-07-01 15:41                                 ` Naveen N. Rao
2013-06-20  7:48   ` Borislav Petkov
2013-06-20 19:02     ` Naveen N. Rao
2013-06-20  7:39 ` [PATCH v2 1/2] mce: acpi/apei: Honour Firmware First for MCA banks listed in APEI HEST CMC Borislav Petkov
2013-06-20 19:08   ` Naveen N. Rao
2013-06-20 19:29     ` Borislav Petkov
2013-06-20 20:14       ` Naveen N. Rao
2013-06-20 20:57         ` Borislav Petkov
2013-06-20 21:22           ` Naveen N. Rao
2013-06-21  7:34             ` Borislav Petkov
2013-06-21  7:46               ` Naveen N. Rao
2013-06-21  8:36                 ` Borislav Petkov
2013-06-21  9:32                   ` Naveen N. Rao
2013-06-21 14:08                     ` Borislav Petkov
2013-06-21 16:47                   ` Tony Luck
2013-06-21 17:40                     ` Borislav Petkov
2013-06-25 17:46                       ` Naveen N. Rao
2013-06-25 17:53                         ` Borislav Petkov
2013-06-25 17:55                         ` Luck, Tony
2013-06-25 18:28                           ` Naveen N. Rao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).