All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
@ 2017-04-04 17:24 Yazen Ghannam
  2017-04-04 17:24 ` [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration Yazen Ghannam
  2017-04-05 13:39 ` [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Borislav Petkov
  0 siblings, 2 replies; 22+ messages in thread
From: Yazen Ghannam @ 2017-04-04 17:24 UTC (permalink / raw)
  To: linux-edac; +Cc: Yazen Ghannam, Tony Luck, Borislav Petkov, x86, linux-kernel

From: Yazen Ghannam <yazen.ghannam@amd.com>

We have support for the new SMCA MCA_DE{STAT,ADDR} registers in Linux. So
we've used these registers in place of MCA_{STATUS,ADDR} on SMCA systems.
However, the guidance for current implementations of SMCA is to continue
using MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if a Deferred
error was not found in the former registers. This also means we shouldn't
clear MCA_CONFIG[LogDeferredInMcaStat].

Redo the AMD Deferred error interrupt handler to follow the guidance for
current SMCA systems. Also, don't break after finding the first error.

Rework __log_error() for clarity.

Don't clear MCA_CONFIG[LogDeferredInMcaStat] during AMD mcheck init.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Link:
https://lkml.kernel.org/r/1490210971-62346-1-git-send-email-Yazen.Ghannam@amd.com

v1->v2:
- Change name of check_deferred_status() to is_deferred_error().
- Rework __log_error() to move out SMCA/Deferred error-specific code.

 arch/x86/kernel/cpu/mcheck/mce_amd.c | 67 ++++++++++++++++--------------------
 1 file changed, 30 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 6e4a047..2caa84c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -472,20 +472,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 		smca_high |= BIT(0);
 
 		/*
-		 * SMCA logs Deferred Error information in MCA_DE{STAT,ADDR}
-		 * registers with the option of additionally logging to
-		 * MCA_{STATUS,ADDR} if MCA_CONFIG[LogDeferredInMcaStat] is set.
-		 *
-		 * This bit is usually set by BIOS to retain the old behavior
-		 * for OSes that don't use the new registers. Linux supports the
-		 * new registers so let's disable that additional logging here.
-		 *
-		 * MCA_CONFIG[LogDeferredInMcaStat] is bit 34 (bit 2 in the high
-		 * portion of the MSR).
-		 */
-		smca_high &= ~BIT(2);
-
-		/*
 		 * SMCA sets the Deferred Error Interrupt type per bank.
 		 *
 		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
@@ -756,20 +742,11 @@ int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr)
 EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr);
 
 static void
-__log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
+__log_error(unsigned int bank, u32 msr_status, u32 msr_addr, u64 misc)
 {
-	u32 msr_status = msr_ops.status(bank);
-	u32 msr_addr = msr_ops.addr(bank);
 	struct mce m;
 	u64 status;
 
-	WARN_ON_ONCE(deferred_err && threshold_err);
-
-	if (deferred_err && mce_flags.smca) {
-		msr_status = MSR_AMD64_SMCA_MCx_DESTAT(bank);
-		msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
-	}
-
 	rdmsrl(msr_status, status);
 
 	if (!(status & MCI_STATUS_VAL))
@@ -778,12 +755,10 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	mce_setup(&m);
 
 	m.status = status;
+	m.misc   = misc;
 	m.bank   = bank;
 	m.tsc	 = rdtsc();
 
-	if (threshold_err)
-		m.misc = misc;
-
 	if (m.status & MCI_STATUS_ADDRV) {
 		rdmsrl(msr_addr, m.addr);
 
@@ -810,6 +785,20 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc)
 	wrmsrl(msr_status, 0);
 }
 
+static void __log_error_deferred(unsigned int bank, bool use_smca_destat)
+{
+	u32 msr_status	= use_smca_destat ? MSR_AMD64_SMCA_MCx_DESTAT(bank) :
+					    msr_ops.status(bank);
+	u32 msr_addr	= use_smca_destat ? MSR_AMD64_SMCA_MCx_DEADDR(bank) :
+					    msr_ops.addr(bank);
+
+	__log_error(bank, msr_status, msr_addr, 0);
+
+	/* We should still clear MCA_DESTAT even if we used MCA_STATUS. */
+	if (mce_flags.smca && !use_smca_destat)
+		wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
+}
+
 static inline void __smp_deferred_error_interrupt(void)
 {
 	inc_irq_stat(irq_deferred_error_count);
@@ -832,25 +821,29 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
 	exiting_ack_irq();
 }
 
+static inline bool is_deferred_error(u64 status)
+{
+	return ((status & MCI_STATUS_VAL) && (status & MCI_STATUS_DEFERRED));
+}
+
 /* APIC interrupt handler for deferred errors */
 static void amd_deferred_error_interrupt(void)
 {
 	unsigned int bank;
-	u32 msr_status;
 	u64 status;
 
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
-		msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
-					      : msr_ops.status(bank);
+		rdmsrl(msr_ops.status(bank), status);
 
-		rdmsrl(msr_status, status);
+		if (is_deferred_error(status)) {
+			__log_error_deferred(bank, false);
 
-		if (!(status & MCI_STATUS_VAL) ||
-		    !(status & MCI_STATUS_DEFERRED))
-			continue;
+		} else if (mce_flags.smca) {
+			rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);
 
-		__log_error(bank, true, false, 0);
-		break;
+			if (is_deferred_error(status))
+				__log_error_deferred(bank, true);
+		}
 	}
 }
 
@@ -904,7 +897,7 @@ static void amd_threshold_interrupt(void)
 	return;
 
 log:
-	__log_error(bank, false, true, ((u64)high << 32) | low);
+	__log_error(bank, msr_ops.status(bank), msr_ops.addr(bank), ((u64)high << 32) | low);
 
 	/* Reset threshold block after logging error. */
 	memset(&tr, 0, sizeof(tr));
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration
  2017-04-04 17:24 [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Yazen Ghannam
@ 2017-04-04 17:24 ` Yazen Ghannam
  2017-04-05 14:19   ` Borislav Petkov
  2017-04-05 13:39 ` [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Borislav Petkov
  1 sibling, 1 reply; 22+ messages in thread
From: Yazen Ghannam @ 2017-04-04 17:24 UTC (permalink / raw)
  To: linux-edac; +Cc: Yazen Ghannam, Tony Luck, Borislav Petkov, x86, linux-kernel

From: Yazen Ghannam <yazen.ghannam@amd.com>

Scalable MCA systems have a new MCA_CONFIG register that we use to
configure each bank. We currently use this when we set up thresholding.
However, this is logically separate.

Group all SMCA-related initialization into a single, separate function.
This includes setting MCA_CONFIG and gathering SMCA bank info.

Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
Link:
https://lkml.kernel.org/r/1490210971-62346-2-git-send-email-Yazen.Ghannam@amd.com

v1->v2:
- Merge get_smca_bank_info() and set_smca_config() into smca_configure().

 arch/x86/kernel/cpu/mcheck/mce_amd.c | 74 ++++++++++++++++++------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 2caa84c..b51d4d7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -164,17 +164,48 @@ static void default_deferred_error_interrupt(void)
 }
 void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt;
 
-static void get_smca_bank_info(unsigned int bank)
+static void smca_configure(unsigned int bank)
 {
 	unsigned int i, hwid_mcatype, cpu = smp_processor_id();
 	struct smca_hwid *s_hwid;
-	u32 high, instance_id;
+	u32 high, low;
+	u32 smca_config = MSR_AMD64_SMCA_MCx_CONFIG(bank);
+
+	/* Set appropriate bits in MCA_CONFIG */
+	if (!rdmsr_safe(smca_config, &low, &high)) {
+		/*
+		 * OS is required to set the MCAX bit to acknowledge that it is
+		 * now using the new MSR ranges and new registers under each
+		 * bank. It also means that the OS will configure deferred
+		 * errors in the new MCx_CONFIG register. If the bit is not set,
+		 * uncorrectable errors will cause a system panic.
+		 *
+		 * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
+		 */
+		high |= BIT(0);
+
+		/*
+		 * SMCA sets the Deferred Error Interrupt type per bank.
+		 *
+		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
+		 * if the DeferredIntType bit field is available.
+		 *
+		 * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
+		 * high portion of the MSR). OS should set this to 0x1 to enable
+		 * APIC based interrupt. First, check that no interrupt has been
+		 * set.
+		 */
+		if ((low & BIT(5)) && !((high >> 5) & 0x3))
+			high |= BIT(5);
+
+		wrmsr(smca_config, low, high);
+	}
 
 	/* Collect bank_info using CPU 0 for now. */
 	if (cpu)
 		return;
 
-	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) {
+	if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) {
 		pr_warn("Failed to read MCA_IPID for bank %d\n", bank);
 		return;
 	}
@@ -191,7 +222,7 @@ static void get_smca_bank_info(unsigned int bank)
 			     smca_get_name(s_hwid->bank_type));
 
 			smca_banks[bank].hwid = s_hwid;
-			smca_banks[bank].id = instance_id;
+			smca_banks[bank].id = low;
 			smca_banks[bank].sysfs_id = s_hwid->count++;
 			break;
 		}
@@ -433,7 +464,7 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 			int offset, u32 misc_high)
 {
 	unsigned int cpu = smp_processor_id();
-	u32 smca_low, smca_high, smca_addr;
+	u32 smca_low, smca_high;
 	struct threshold_block b;
 	int new;
 
@@ -457,37 +488,6 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr,
 		goto set_offset;
 	}
 
-	smca_addr = MSR_AMD64_SMCA_MCx_CONFIG(bank);
-
-	if (!rdmsr_safe(smca_addr, &smca_low, &smca_high)) {
-		/*
-		 * OS is required to set the MCAX bit to acknowledge that it is
-		 * now using the new MSR ranges and new registers under each
-		 * bank. It also means that the OS will configure deferred
-		 * errors in the new MCx_CONFIG register. If the bit is not set,
-		 * uncorrectable errors will cause a system panic.
-		 *
-		 * MCA_CONFIG[MCAX] is bit 32 (0 in the high portion of the MSR.)
-		 */
-		smca_high |= BIT(0);
-
-		/*
-		 * SMCA sets the Deferred Error Interrupt type per bank.
-		 *
-		 * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us
-		 * if the DeferredIntType bit field is available.
-		 *
-		 * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the
-		 * high portion of the MSR). OS should set this to 0x1 to enable
-		 * APIC based interrupt. First, check that no interrupt has been
-		 * set.
-		 */
-		if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3))
-			smca_high |= BIT(5);
-
-		wrmsr(smca_addr, smca_low, smca_high);
-	}
-
 	/* Gather LVT offset for thresholding: */
 	if (rdmsr_safe(MSR_CU_DEF_ERR, &smca_low, &smca_high))
 		goto out;
@@ -516,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 
 	for (bank = 0; bank < mca_cfg.banks; ++bank) {
 		if (mce_flags.smca)
-			get_smca_bank_info(bank);
+			smca_configure(bank);
 
 		for (block = 0; block < NR_BLOCKS; ++block) {
 			address = get_block_address(cpu, address, low, high, bank, block);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-04 17:24 [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Yazen Ghannam
  2017-04-04 17:24 ` [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration Yazen Ghannam
@ 2017-04-05 13:39 ` Borislav Petkov
  2017-04-05 14:52   ` Ghannam, Yazen
  1 sibling, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 13:39 UTC (permalink / raw)
  To: Yazen Ghannam; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 04, 2017 at 12:24:31PM -0500, Yazen Ghannam wrote:
> From: Yazen Ghannam <yazen.ghannam@amd.com>
> 
> We have support for the new SMCA MCA_DE{STAT,ADDR} registers in Linux. So
> we've used these registers in place of MCA_{STATUS,ADDR} on SMCA systems.
> However, the guidance for current implementations of SMCA is to continue
> using MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if a Deferred
> error was not found in the former registers. This also means we shouldn't
> clear MCA_CONFIG[LogDeferredInMcaStat].

Does that mean we should trust what BIOS has set it to?

> 
> Redo the AMD Deferred error interrupt handler to follow the guidance for
> current SMCA systems. Also, don't break after finding the first error.
> 
> Rework __log_error() for clarity.
> 
> Don't clear MCA_CONFIG[LogDeferredInMcaStat] during AMD mcheck init.
> 
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> ---

...

> @@ -832,25 +821,29 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void)
>  	exiting_ack_irq();
>  }
>  
> +static inline bool is_deferred_error(u64 status)
> +{
> +	return ((status & MCI_STATUS_VAL) && (status & MCI_STATUS_DEFERRED));
> +}
> +
>  /* APIC interrupt handler for deferred errors */
>  static void amd_deferred_error_interrupt(void)
>  {
>  	unsigned int bank;
> -	u32 msr_status;
>  	u64 status;
>  
>  	for (bank = 0; bank < mca_cfg.banks; ++bank) {
> -		msr_status = (mce_flags.smca) ? MSR_AMD64_SMCA_MCx_DESTAT(bank)
> -					      : msr_ops.status(bank);
> +		rdmsrl(msr_ops.status(bank), status);
>  
> -		rdmsrl(msr_status, status);
> +		if (is_deferred_error(status)) {
> +			__log_error_deferred(bank, false);
>  
> -		if (!(status & MCI_STATUS_VAL) ||
> -		    !(status & MCI_STATUS_DEFERRED))
> -			continue;
> +		} else if (mce_flags.smca) {
> +			rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);
>  
> -		__log_error(bank, true, false, 0);
> -		break;
> +			if (is_deferred_error(status))
> +				__log_error_deferred(bank, true);
> +		}

The diff is very hard to parse, lemme paste the whole resulting function
after the patch:

> /* APIC interrupt handler for deferred errors */
> static void amd_deferred_error_interrupt(void)
> {
>         unsigned int bank;
>         u64 status;
> 
>         for (bank = 0; bank < mca_cfg.banks; ++bank) {
>                 rdmsrl(msr_ops.status(bank), status);
> 
>                 if (is_deferred_error(status)) {
>                         __log_error_deferred(bank, false);
> 
>                 } else if (mce_flags.smca) {
>                         rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);
> 
>                         if (is_deferred_error(status))
>                                 __log_error_deferred(bank, true);

So this doesn't sound like the guidance: if we should fallback to the
DE* versions only if a deferred error wasn't found in the usual regs,
the logic should be:

static void __log_error_deferred(unsigned int bank)
{
	u32 msr_stat = msr_ops.status(bank);
	u32 msr_addr = msr_ops.addr(bank);

	/*
	 * __log_error() needs to return true to say it has logged successfully or false
	 * if it hasn't.
	 */
	if (__log_error(bank, msr_stat, msr_addr, 0))
		goto out;

	if (!mce_flags.smca)
		return;

	/*
	 * Couldn't log a deferred error from the usual regs, fallback to DE*
	 * variants.
	 */
        msr_stat = MSR_AMD64_SMCA_MCx_DESTAT(bank);
        msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);

        __log_error(bank, msr_stat, msr_addr, 0);

out:
        /* We should still clear MCA_DESTAT even if we used MCA_STATUS. */
        if (mce_flags.smca)
                wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
}

and this way you don't need to wiggle around that use_smca_destat thing.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration
  2017-04-04 17:24 ` [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration Yazen Ghannam
@ 2017-04-05 14:19   ` Borislav Petkov
  0 siblings, 0 replies; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 14:19 UTC (permalink / raw)
  To: Yazen Ghannam; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 04, 2017 at 12:24:32PM -0500, Yazen Ghannam wrote:
> From: Yazen Ghannam <yazen.ghannam@amd.com>
> 
> Scalable MCA systems have a new MCA_CONFIG register that we use to
> configure each bank. We currently use this when we set up thresholding.
> However, this is logically separate.
> 
> Group all SMCA-related initialization into a single, separate function.
> This includes setting MCA_CONFIG and gathering SMCA bank info.
> 
> Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> ---
> Link:
> https://lkml.kernel.org/r/1490210971-62346-2-git-send-email-Yazen.Ghannam@amd.com
> 
> v1->v2:
> - Merge get_smca_bank_info() and set_smca_config() into smca_configure().
> 
>  arch/x86/kernel/cpu/mcheck/mce_amd.c | 74 ++++++++++++++++++------------------
>  1 file changed, 37 insertions(+), 37 deletions(-)

...

> @@ -516,7 +516,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
>  
>  	for (bank = 0; bank < mca_cfg.banks; ++bank) {
>  		if (mce_flags.smca)
> -			get_smca_bank_info(bank);
> +			smca_configure(bank);

Pass in cpu from above:

			smca_configure(bank, cpu);


>  
>  		for (block = 0; block < NR_BLOCKS; ++block) {
>  			address = get_block_address(cpu, address, low, high, bank, block);
> -- 

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 13:39 ` [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Borislav Petkov
@ 2017-04-05 14:52   ` Ghannam, Yazen
  2017-04-05 16:44     ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-05 14:52 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Wednesday, April 05, 2017 9:40 AM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Tue, Apr 04, 2017 at 12:24:31PM -0500, Yazen Ghannam wrote:
> > From: Yazen Ghannam <yazen.ghannam@amd.com>
> >
> > We have support for the new SMCA MCA_DE{STAT,ADDR} registers in Linux.
> > So we've used these registers in place of MCA_{STATUS,ADDR} on SMCA
> systems.
> > However, the guidance for current implementations of SMCA is to
> > continue using MCA_{STATUS,ADDR} and to use MCA_DE{STAT,ADDR} only if
> > a Deferred error was not found in the former registers. This also
> > means we shouldn't clear MCA_CONFIG[LogDeferredInMcaStat].
> 
> Does that mean we should trust what BIOS has set it to?
> 

Yes, BIOS does and should set this bit. If it doesn't for some reason then our
handling in the Kernel will still be functional. We just won't find Deferred errors
in MCA_STATUS.

> >
> > Redo the AMD Deferred error interrupt handler to follow the guidance
> > for current SMCA systems. Also, don't break after finding the first error.
> >
> > Rework __log_error() for clarity.
> >
> > Don't clear MCA_CONFIG[LogDeferredInMcaStat] during AMD mcheck init.
> >
> > Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com>
> > ---
> 
> ...
> 
> > @@ -832,25 +821,29 @@ asmlinkage __visible void __irq_entry
> smp_trace_deferred_error_interrupt(void)
> >  	exiting_ack_irq();
> >  }
> >
> > +static inline bool is_deferred_error(u64 status) {
> > +	return ((status & MCI_STATUS_VAL) && (status &
> > +MCI_STATUS_DEFERRED)); }
> > +
> >  /* APIC interrupt handler for deferred errors */  static void
> > amd_deferred_error_interrupt(void)
> >  {
> >  	unsigned int bank;
> > -	u32 msr_status;
> >  	u64 status;
> >
> >  	for (bank = 0; bank < mca_cfg.banks; ++bank) {
> > -		msr_status = (mce_flags.smca) ?
> MSR_AMD64_SMCA_MCx_DESTAT(bank)
> > -					      : msr_ops.status(bank);
> > +		rdmsrl(msr_ops.status(bank), status);
> >
> > -		rdmsrl(msr_status, status);
> > +		if (is_deferred_error(status)) {
> > +			__log_error_deferred(bank, false);
> >
> > -		if (!(status & MCI_STATUS_VAL) ||
> > -		    !(status & MCI_STATUS_DEFERRED))
> > -			continue;
> > +		} else if (mce_flags.smca) {
> > +			rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank),
> status);
> >
> > -		__log_error(bank, true, false, 0);
> > -		break;
> > +			if (is_deferred_error(status))
> > +				__log_error_deferred(bank, true);
> > +		}
> 
> The diff is very hard to parse, lemme paste the whole resulting function after
> the patch:
> 
> > /* APIC interrupt handler for deferred errors */ static void
> > amd_deferred_error_interrupt(void)
> > {
> >         unsigned int bank;
> >         u64 status;
> >
> >         for (bank = 0; bank < mca_cfg.banks; ++bank) {
> >                 rdmsrl(msr_ops.status(bank), status);
> >
> >                 if (is_deferred_error(status)) {
> >                         __log_error_deferred(bank, false);
> >
> >                 } else if (mce_flags.smca) {
> >                         rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank),
> > status);
> >
> >                         if (is_deferred_error(status))
> >                                 __log_error_deferred(bank, true);
> 
> So this doesn't sound like the guidance: if we should fallback to the
> DE* versions only if a deferred error wasn't found in the usual regs, the logic
> should be:
> 
> static void __log_error_deferred(unsigned int bank) {
> 	u32 msr_stat = msr_ops.status(bank);
> 	u32 msr_addr = msr_ops.addr(bank);
> 
> 	/*
> 	 * __log_error() needs to return true to say it has logged successfully
> or false
> 	 * if it hasn't.
> 	 */
> 	if (__log_error(bank, msr_stat, msr_addr, 0))
> 		goto out;
> 

I'd rather we keep as many checks as possible out of __log_error().

> 	if (!mce_flags.smca)
> 		return;
> 
> 	/*
> 	 * Couldn't log a deferred error from the usual regs, fallback to DE*
> 	 * variants.
> 	 */
>         msr_stat = MSR_AMD64_SMCA_MCx_DESTAT(bank);
>         msr_addr = MSR_AMD64_SMCA_MCx_DEADDR(bank);
> 
>         __log_error(bank, msr_stat, msr_addr, 0);
> 
> out:
>         /* We should still clear MCA_DESTAT even if we used MCA_STATUS. */
>         if (mce_flags.smca)
>                 wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); }
> 
> and this way you don't need to wiggle around that use_smca_destat thing.
> 

Your suggestion gave me an idea. Let's drop __log_error_deferred() and just
select the correct registers in the deferred error interrupt handler.

/*
 * APIC interrupt handler for deferred errors
 *
 * We have three scenarios for checking for Deferred errors.
 * 1) Non-SMCA systems check MCA_STATUS and log error if found.
 * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
 *    clear MCA_DESTAT.
 * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
 *    log it.
 */
static void amd_deferred_error_interrupt(void)
{
        unsigned int bank;
        u64 status;

        for (bank = 0; bank < mca_cfg.banks; ++bank) {
                rdmsrl(msr_ops.status(bank), status);

                if (is_deferred_error(status)) {
                        __log_error(bank, msr_ops.status(bank), msr_ops.addr(bank), 0);
        
                        /* Clear MCA_DESTAT even if we used MCA_STATUS. */
                        if (mce_flags.smca)
                                wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);

                } else if (mce_flags.smca) {
                        rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);

                        if (is_deferred_error(status))
                                __log_error(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
                }
        }
}

What do you think?

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 14:52   ` Ghannam, Yazen
@ 2017-04-05 16:44     ` Borislav Petkov
  2017-04-05 17:06       ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 16:44 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Wed, Apr 05, 2017 at 02:52:08PM +0000, Ghannam, Yazen wrote:
> Yes, BIOS does and should set this bit. If it doesn't for some reason
> then our handling in the Kernel will still be functional. We just
> won't find Deferred errors in MCA_STATUS.

Ok.

> I'd rather we keep as many checks as possible out of __log_error().

What checks?

> Your suggestion gave me an idea. Let's drop __log_error_deferred() and just
> select the correct registers in the deferred error interrupt handler.
> 
> /*
>  * APIC interrupt handler for deferred errors
>  *
>  * We have three scenarios for checking for Deferred errors.
>  * 1) Non-SMCA systems check MCA_STATUS and log error if found.
>  * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
>  *    clear MCA_DESTAT.
>  * 3) SMCA systems check MCA_DESTAT, if error was not found in MCA_STATUS, and
>  *    log it.
>  */
> static void amd_deferred_error_interrupt(void)
> {
>         unsigned int bank;
>         u64 status;
> 
>         for (bank = 0; bank < mca_cfg.banks; ++bank) {
>                 rdmsrl(msr_ops.status(bank), status);
> 
>                 if (is_deferred_error(status)) {
>                         __log_error(bank, msr_ops.status(bank), msr_ops.addr(bank), 0);

So we're an SMCA box and we land here on a deferred error, we don't have
anything in the standard MSRs...

>                         /* Clear MCA_DESTAT even if we used MCA_STATUS. */
>                         if (mce_flags.smca)
>                                 wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);

... and here we clear the info which we wanted to log before we log it!

> 
>                 } else if (mce_flags.smca) {
>                         rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);
> 
>                         if (is_deferred_error(status))
>                                 __log_error(bank, MSR_AMD64_SMCA_MCx_DESTAT(bank), MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);

So we execute __log_error() twice on an SMCA box for a deferred error.

So no, I'd like to see clear code flow which does what the guidance is.
And if you feel it is not readable enough then you restructure more. And
I gave you a perfectly fine __log_error_deferred() which actually does
what the guidance is.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 16:44     ` Borislav Petkov
@ 2017-04-05 17:06       ` Ghannam, Yazen
  2017-04-05 17:22         ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-05 17:06 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Wednesday, April 05, 2017 12:45 PM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> 
> > I'd rather we keep as many checks as possible out of __log_error().
> 
> What checks?
> 

Checking if we have a valid deferred error. Since we call __log_error() on
thresholding interrupts too we would need to tell it which handler is calling
it to do the correct check. This is what we currently do.

> > Your suggestion gave me an idea. Let's drop __log_error_deferred() and
> > just select the correct registers in the deferred error interrupt handler.
> >
> > /*
> >  * APIC interrupt handler for deferred errors
> >  *
> >  * We have three scenarios for checking for Deferred errors.
> >  * 1) Non-SMCA systems check MCA_STATUS and log error if found.
> >  * 2) SMCA systems check MCA_STATUS. If error is found then log it and also
> >  *    clear MCA_DESTAT.
> >  * 3) SMCA systems check MCA_DESTAT, if error was not found in
> MCA_STATUS, and
> >  *    log it.
> >  */
> > static void amd_deferred_error_interrupt(void)
> > {
> >         unsigned int bank;
> >         u64 status;
> >
> >         for (bank = 0; bank < mca_cfg.banks; ++bank) {
> >                 rdmsrl(msr_ops.status(bank), status);
> >
> >                 if (is_deferred_error(status)) {
> >                         __log_error(bank, msr_ops.status(bank),
> > msr_ops.addr(bank), 0);
> 
> So we're an SMCA box and we land here on a deferred error, we don't have
> anything in the standard MSRs...
> 

What do you mean " we don't have anything"? We check if we have a valid
deferred error in is_deferred_error(). Otherwise, we don't log anything.

> >                         /* Clear MCA_DESTAT even if we used MCA_STATUS. */
> >                         if (mce_flags.smca)
> >
> > wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0);
> 
> ... and here we clear the info which we wanted to log before we log it!
> 

No we don't. If we don't have a valid deferred error in MCA_STATUS then we
don't get here.

> >
> >                 } else if (mce_flags.smca) {
> >                         rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank),
> > status);
> >
> >                         if (is_deferred_error(status))
> >                                 __log_error(bank,
> > MSR_AMD64_SMCA_MCx_DESTAT(bank),
> MSR_AMD64_SMCA_MCx_DEADDR(bank), 0);
> 
> So we execute __log_error() twice on an SMCA box for a deferred error.
> 

No we don't. This is an if/else-if statement.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 17:06       ` Ghannam, Yazen
@ 2017-04-05 17:22         ` Borislav Petkov
  2017-04-05 17:53           ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 17:22 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Wed, Apr 05, 2017 at 05:06:19PM +0000, Ghannam, Yazen wrote:
> Checking if we have a valid deferred error. Since we call __log_error() on
> thresholding interrupts too we would need to tell it which handler is calling
> it to do the correct check. This is what we currently do.

That's why I suggested a __log_error_deferred() - a separate function
which deals with deferred errors.

> What do you mean " we don't have anything"? We check if we have a valid
> deferred error in is_deferred_error(). Otherwise, we don't log anything.

So the normal status MSR says whether we have a deferred error or not.
If it says we don't, then we have to look at the DE* MSRs, correct?

If yes, then do it exactly like this.

Not:

	IF deferred:
	 	log
	ELSE IF SMCA:
		IF deferred:
			log

but:

	IF deferred:
		log_deferred:
			log
			IF cannot log from normal MSRs
				log from DE

Why are we even wasting time with this?!

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 17:22         ` Borislav Petkov
@ 2017-04-05 17:53           ` Ghannam, Yazen
  2017-04-05 18:22             ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-05 17:53 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Wednesday, April 05, 2017 1:22 PM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Wed, Apr 05, 2017 at 05:06:19PM +0000, Ghannam, Yazen wrote:
> > Checking if we have a valid deferred error. Since we call
> > __log_error() on thresholding interrupts too we would need to tell it
> > which handler is calling it to do the correct check. This is what we currently
> do.
> 
> That's why I suggested a __log_error_deferred() - a separate function which
> deals with deferred errors.
> 
> > What do you mean " we don't have anything"? We check if we have a
> > valid deferred error in is_deferred_error(). Otherwise, we don't log
> anything.
> 
> So the normal status MSR says whether we have a deferred error or not.
> If it says we don't, then we have to look at the DE* MSRs, correct?
> 

Correct, but only on SMCA systems.

> If yes, then do it exactly like this.
> 
> Not:
> 
> 	IF deferred:
> 	 	log
> 	ELSE IF SMCA:
> 		IF deferred:
> 			log
> 

This works so I don't know why it's not okay. Your suggestion also does an
SMCA check. So code that does a check-and-return is preferable to code
using if/else-if statements? If that's the case then I can try to rework it.

> but:
> 
> 	IF deferred:
> 		log_deferred:
> 			log
> 			IF cannot log from normal MSRs

How does log_error() know if we can't use the normal MSRs? We check
for MCI_STATUS_VAL in log_error(). We also need to check for
MCI_STATUS_DEFERRED but only if we're coming from the deferred error
handler.

> 				log from DE
> 
> Why are we even wasting time with this?!
> 

I don't know. 

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 17:53           ` Ghannam, Yazen
@ 2017-04-05 18:22             ` Borislav Petkov
  2017-04-05 19:29               ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 18:22 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Wed, Apr 05, 2017 at 05:53:57PM +0000, Ghannam, Yazen wrote:
> Correct, but only on SMCA systems.

On !SMCA systems you log only once anyway - you don't have DE* MSRs.

> This works so I don't know why it's not okay.

So I should take the code just because you have found one way that it
works? Screw readability or future proof design - people should dump the
code on maintainers and maintainers should deal with the stinking pile.
Who cares, it works for you.

> Your suggestion also does an SMCA check.

Why TF does that even matter? Enough with the dumb checks argument
already.

> So code that does a check-and-return is preferable to code using
> if/else-if statements? If that's the case then I can try to rework it.

No, readable code matters. Your suggestion is confusing. I see

	if (is_deferred_error(status)) {
		__log_error
	} else if (mce_flags.smca) {
		...
		if (is_deferred_error(status))

But then here --->

we still deal with deferred errors. Even though we did the deferred
check in the if-clause. And that is confusing and makes the code hard to
follow.

My suggestion does *exactly* what the commit message says:

	if (__log_error(..))
		goto out;

when we go to the out label, we know we've succeeded logging the
deferred error and we're done.

        if (!mce_flags.smca)
                return;

When we return here, we know, we've taken care of the !SMCA systems. Now
here starts the second attempt to read the DE* registers and we *know*
we're on SMCA systems.

> How does log_error() know if we can't use the normal MSRs?

MCI_STATUS_VAL.

> We check for MCI_STATUS_VAL in log_error().

Yes.

> We also need to check for MCI_STATUS_DEFERRED but only if we're coming
> from the deferred error handler.

Why? We *are* coming from the #DF handler so are you expecting a
different type of error in the MSRs?

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 18:22             ` Borislav Petkov
@ 2017-04-05 19:29               ` Ghannam, Yazen
  2017-04-05 20:04                 ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-05 19:29 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Wednesday, April 05, 2017 2:22 PM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> 
> > How does log_error() know if we can't use the normal MSRs?
> 
> MCI_STATUS_VAL.
> 
> > We check for MCI_STATUS_VAL in log_error().
> 
> Yes.
> 
> > We also need to check for MCI_STATUS_DEFERRED but only if we're coming
> > from the deferred error handler.
> 
> Why? We *are* coming from the #DF handler so are you expecting a
> different type of error in the MSRs?
> 

Yes, there could be depending on how MCA_CONFIG[LogDeferredInMcaStat] is
set among other things.

If it's set, then I expect a Deferred error in MCA_STATUS since any Correctable
Errors will be overwritten. Multiple bank types can generate Deferred errors
so there may also be cases where for some types a valid Uncorrectable error
happens and overwrites the Deferred error before we can handle it. In which
case we lose the Deferred error if we don't check MCA_DESTAT.

If it's not set, then it's possible to have a valid Correctable error in MCA_STATUS
while the valid Deferred error is in MCA_DESTAT.

Right now MCA_CONFIG[LogDeferredInMcaStat] is set but this may change for
future SMCA implementations.

Thanks,
Yazen 

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 19:29               ` Ghannam, Yazen
@ 2017-04-05 20:04                 ` Borislav Petkov
  2017-04-07 20:37                   ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-05 20:04 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Wed, Apr 05, 2017 at 07:29:48PM +0000, Ghannam, Yazen wrote:
> If it's set, then I expect a Deferred error in MCA_STATUS since any Correctable
> Errors will be overwritten. Multiple bank types can generate Deferred errors
> so there may also be cases where for some types a valid Uncorrectable error
> happens and overwrites the Deferred error before we can handle it. In which
> case we lose the Deferred error if we don't check MCA_DESTAT.

So if we have an UE, wouldn't that raise an #MC? I guess in such cases
we should concentrate only on the deferred errors and let the #MC
handler deal with them. As we do now.

> If it's not set, then it's possible to have a valid Correctable error in MCA_STATUS
> while the valid Deferred error is in MCA_DESTAT.

What's logging the CE? We probably should log it too before something
overwrites it.

Anyway, ok, I think I know what needs to happen now:

amd_deferred_error_interrupt:

	if (__log_error_deferred(bank))
		return;

This one read MC?_STATUS and does the logging for when the deferred
error is in the normal MSRs. It returns true if it succeeded. It reads
and hands down both MC?_STATUS and MC?_ADDR to __log_error() so that it
doesn't have to read MC?_STATUS twice.

If __log_error_deferred() has read a different type of error, we still
log it? I'm not sure about this. I guess we can ignore that case for
now.

Then:

	if (mca_flags.smca)
		__log_error_deferred_smca(bank));

which handles the SMCA case. It too reads MSR_AMD64_SMCA_MCx_DESTAT
and MSR_AMD64_SMCA_MCx_DEADDR and hands them down to __log_error() for
logging.

For the __log_error() call in amd_threshold_interrupt(), you define a
log_error() wrapper which reads the default MSRs and hands them down to
__log_error().

So __log_error() always gets STATUS and ADDR MSR values and it doesn't
need to read them from the MSRs but only log them.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-05 20:04                 ` Borislav Petkov
@ 2017-04-07 20:37                   ` Ghannam, Yazen
  2017-04-07 21:35                     ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-07 20:37 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Wednesday, April 05, 2017 4:05 PM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Wed, Apr 05, 2017 at 07:29:48PM +0000, Ghannam, Yazen wrote:
> > If it's set, then I expect a Deferred error in MCA_STATUS since any
> > Correctable Errors will be overwritten. Multiple bank types can
> > generate Deferred errors so there may also be cases where for some
> > types a valid Uncorrectable error happens and overwrites the Deferred
> > error before we can handle it. In which case we lose the Deferred error if
> we don't check MCA_DESTAT.
> 
> So if we have an UE, wouldn't that raise an #MC? I guess in such cases we
> should concentrate only on the deferred errors and let the #MC handler deal
> with them. As we do now.
> 

Right.

> > If it's not set, then it's possible to have a valid Correctable error
> > in MCA_STATUS while the valid Deferred error is in MCA_DESTAT.
> 
> What's logging the CE? We probably should log it too before something
> overwrites it.
> 

CEs will get picked up by polling or if we hit a threshold.

> Anyway, ok, I think I know what needs to happen now:
> 
> amd_deferred_error_interrupt:
> 
> 	if (__log_error_deferred(bank))
> 		return;
> 
> This one read MC?_STATUS and does the logging for when the deferred error
> is in the normal MSRs. It returns true if it succeeded. It reads and hands down
> both MC?_STATUS and MC?_ADDR to __log_error() so that it doesn't have to
> read MC?_STATUS twice.
> 

Okay. But do we need a return value? I'm thinking we can go through all banks
and log any and all Deferred errors rather than just the first one we find. I asked
and this is the preferred method like how we do in the #MC handler. The same
applies to the thresholding interrupt handler.

> If __log_error_deferred() has read a different type of error, we still log it? I'm
> not sure about this. I guess we can ignore that case for now.
> 

Yeah, we should ignore other error types. They'll get picked up by other methods.

> Then:
> 
> 	if (mca_flags.smca)
> 		__log_error_deferred_smca(bank));
> 
> which handles the SMCA case. It too reads MSR_AMD64_SMCA_MCx_DESTAT
> and MSR_AMD64_SMCA_MCx_DEADDR and hands them down to
> __log_error() for logging.
> 

Can this just be included in the __log_error_deferred()? I'm thinking something
similar to one of your earlier suggestions. Like this:

static void
__log_error_deferred(unsigned int bank)
{
        u64 status, addr;
        bool logged = false;

        rdmsrl(msr_ops.status, status);

        if (is_deferred_error(status)) {
                rdmsrl(msr_ops.addr, addr);

                __log_error(bank, status, addr, 0);

                wrmsrl(msr_ops.status, 0);

                logged = true;
        }

        if (!mca_flags.smca)
                return;

        if (logged) {
                wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT, 0);
                return;
        }

        rdmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), status);

        if (is_deferred_error(status)) {
                rdmsrl(MSR_AMD64_SMCA_MCx_DEADDR, addr);

                __log_error(bank, status, addr, 0);

                wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT, 0);
        }
}

> For the __log_error() call in amd_threshold_interrupt(), you define a
> log_error() wrapper which reads the default MSRs and hands them down to
> __log_error().
> 

Okay.

> So __log_error() always gets STATUS and ADDR MSR values and it doesn't
> need to read them from the MSRs but only log them.
> 

Okay.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-07 20:37                   ` Ghannam, Yazen
@ 2017-04-07 21:35                     ` Borislav Petkov
  2017-04-11 12:53                       ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-07 21:35 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Fri, Apr 07, 2017 at 08:37:03PM +0000, Ghannam, Yazen wrote:
> CEs will get picked up by polling or if we hit a threshold.

I think we should be pre-emptive here and simply log the error. No use
waiting until something polling finally gets its hands on it - we're
looking at the signature so we might just as well log it.

> Okay. But do we need a return value? I'm thinking we can go through all banks
> and log any and all Deferred errors rather than just the first one we find. I asked
> and this is the preferred method like how we do in the #MC handler. The same
> applies to the thresholding interrupt handler.

... and not only the deferred errors but the CEs too.

Which would make the whole code a *lot* simpler. You simply iterate over
banks:

	for_each_bank()
		log_error()

		if (smca)
			log_error_from_de_regs()

Purely pseudocode of course.

And there's no need to go and look whether the error is a deferred
error or whatnot. In the majority of the cases it will be because we're
in the #DF handler and it better be raised for a #DF.

But even if we see something else, we should simply log it. As long as
it is a valid error signature there's nothing wrong with us logging it
and clearing the regs. The earlier we do so, the lower the probability
for setting the overflow bit.

Yeah, that should be the simplest and most robust way without missing
out on any errors. Hmm, I like it.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-07 21:35                     ` Borislav Petkov
@ 2017-04-11 12:53                       ` Ghannam, Yazen
  2017-04-11 13:12                         ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-11 12:53 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Friday, April 07, 2017 5:35 PM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Fri, Apr 07, 2017 at 08:37:03PM +0000, Ghannam, Yazen wrote:
> > CEs will get picked up by polling or if we hit a threshold.
> 
> I think we should be pre-emptive here and simply log the error. No use
> waiting until something polling finally gets its hands on it - we're looking at the
> signature so we might just as well log it.
> 

Okay, will do.

> > Okay. But do we need a return value? I'm thinking we can go through
> > all banks and log any and all Deferred errors rather than just the
> > first one we find. I asked and this is the preferred method like how
> > we do in the #MC handler. The same applies to the thresholding interrupt
> handler.
> 
> ... and not only the deferred errors but the CEs too.
> 
> Which would make the whole code a *lot* simpler. You simply iterate over
> banks:
> 
> 	for_each_bank()
> 		log_error()
> 
> 		if (smca)
> 			log_error_from_de_regs()
> 
> Purely pseudocode of course.
> 
> And there's no need to go and look whether the error is a deferred error or
> whatnot. In the majority of the cases it will be because we're in the #DF
> handler and it better be raised for a #DF.
> 

If we do as above then we can possibly log the same deferred error twice. So
even if we log every error we should still check if the error is deferred to decide
whether or not to log what's in the DE* registers.

> But even if we see something else, we should simply log it. As long as it is a
> valid error signature there's nothing wrong with us logging it and clearing the
> regs. The earlier we do so, the lower the probability for setting the overflow
> bit.
> 

Okay.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 12:53                       ` Ghannam, Yazen
@ 2017-04-11 13:12                         ` Borislav Petkov
  2017-04-11 13:18                           ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-11 13:12 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 11, 2017 at 12:53:56PM +0000, Ghannam, Yazen wrote:
> If we do as above then we can possibly log the same deferred error twice.

Why twice?

	for_each_bank()
		log_error()
		|-> clear MSRs after logging

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:12                         ` Borislav Petkov
@ 2017-04-11 13:18                           ` Ghannam, Yazen
  2017-04-11 13:24                             ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-11 13:18 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Tuesday, April 11, 2017 9:12 AM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Tue, Apr 11, 2017 at 12:53:56PM +0000, Ghannam, Yazen wrote:
> > If we do as above then we can possibly log the same deferred error twice.
> 
> Why twice?
> 
> 	for_each_bank()
> 		log_error()
> 		|-> clear MSRs after logging
> 

So log_error() reads/clears MCA_STATUS, right? This won't affect MCA_DESTAT
on SMCA systems. So if we call log_error_smca() and unconditionally read
MCA_DESTAT, we will find the same deferred error that we logged in log_error().

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:18                           ` Ghannam, Yazen
@ 2017-04-11 13:24                             ` Borislav Petkov
  2017-04-11 13:32                               ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-11 13:24 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 11, 2017 at 01:18:50PM +0000, Ghannam, Yazen wrote:
> So log_error() reads/clears MCA_STATUS, right? This won't affect MCA_DESTAT
> on SMCA systems. So if we call log_error_smca() and unconditionally read
> MCA_DESTAT, we will find the same deferred error that we logged in log_error().

I'm reading this as, "we log the same deferred error in *both* the
original MCA MSRs and in the new DE* ones". Correct?

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:24                             ` Borislav Petkov
@ 2017-04-11 13:32                               ` Ghannam, Yazen
  2017-04-11 13:35                                 ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-11 13:32 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Tuesday, April 11, 2017 9:25 AM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Tue, Apr 11, 2017 at 01:18:50PM +0000, Ghannam, Yazen wrote:
> > So log_error() reads/clears MCA_STATUS, right? This won't affect
> > MCA_DESTAT on SMCA systems. So if we call log_error_smca() and
> > unconditionally read MCA_DESTAT, we will find the same deferred error
> that we logged in log_error().
> 
> I'm reading this as, "we log the same deferred error in *both* the original
> MCA MSRs and in the new DE* ones". Correct?
> 

Yes, exactly. Deferred errors are *always* logged in the DE* registers and
they are logged in the original MSRs based on the MCA_CONFIG bit.

The idea here is that if a deferred error is overwritten in MCA_STATUS we
will still have a copy logged in MCA_DESTAT.

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:32                               ` Ghannam, Yazen
@ 2017-04-11 13:35                                 ` Borislav Petkov
  2017-04-11 13:50                                   ` Ghannam, Yazen
  0 siblings, 1 reply; 22+ messages in thread
From: Borislav Petkov @ 2017-04-11 13:35 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 11, 2017 at 01:32:03PM +0000, Ghannam, Yazen wrote:
> Yes, exactly. Deferred errors are *always* logged in the DE* registers and

Then the solution is simple:

        for_each_bank()
                if (log_error()) {
			clear_msrs();
			continue;
		}

                if (mca_cfg.smca) {
                        log_error_from_de_regs()
			clear_msrs();
		}
	}

and clear_msrs() clears them all.

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* RE: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:35                                 ` Borislav Petkov
@ 2017-04-11 13:50                                   ` Ghannam, Yazen
  2017-04-11 13:54                                     ` Borislav Petkov
  0 siblings, 1 reply; 22+ messages in thread
From: Ghannam, Yazen @ 2017-04-11 13:50 UTC (permalink / raw)
  To: Borislav Petkov; +Cc: linux-edac, Tony Luck, x86, linux-kernel

> -----Original Message-----
> From: Borislav Petkov [mailto:bp@alien8.de]
> Sent: Tuesday, April 11, 2017 9:36 AM
> To: Ghannam, Yazen <Yazen.Ghannam@amd.com>
> Cc: linux-edac@vger.kernel.org; Tony Luck <tony.luck@intel.com>;
> x86@kernel.org; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA
> MCA_DE{STAT,ADDR} registers
> 
> On Tue, Apr 11, 2017 at 01:32:03PM +0000, Ghannam, Yazen wrote:
> > Yes, exactly. Deferred errors are *always* logged in the DE* registers and
> 
> Then the solution is simple:
> 
>         for_each_bank()
>                 if (log_error()) {
> 			clear_msrs();

We need to make sure log_error() logged the deferred error before clearing
MCA_DESTAT. We shouldn't assume that it did because we're in the #DF handler.
There's still a possibility that it was overwritten even if very rare.

> 			continue;
> 		}
> 
>                 if (mca_cfg.smca) {
>                         log_error_from_de_regs()
> 			clear_msrs();
> 		}
> 	}
> 
> and clear_msrs() clears them all.
> 

Thanks,
Yazen

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers
  2017-04-11 13:50                                   ` Ghannam, Yazen
@ 2017-04-11 13:54                                     ` Borislav Petkov
  0 siblings, 0 replies; 22+ messages in thread
From: Borislav Petkov @ 2017-04-11 13:54 UTC (permalink / raw)
  To: Ghannam, Yazen; +Cc: linux-edac, Tony Luck, x86, linux-kernel

On Tue, Apr 11, 2017 at 01:50:02PM +0000, Ghannam, Yazen wrote:
> We need to make sure log_error() logged the deferred error before clearing
> MCA_DESTAT.

I can think of a couple of ways how to do that...

-- 
Regards/Gruss,
    Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2017-04-11 13:55 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-04 17:24 [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Yazen Ghannam
2017-04-04 17:24 ` [PATCH v2 2/2] x86/mce/AMD: Carve out SMCA bank configuration Yazen Ghannam
2017-04-05 14:19   ` Borislav Petkov
2017-04-05 13:39 ` [PATCH v2 1/2] x86/mce/AMD: Redo use of SMCA MCA_DE{STAT,ADDR} registers Borislav Petkov
2017-04-05 14:52   ` Ghannam, Yazen
2017-04-05 16:44     ` Borislav Petkov
2017-04-05 17:06       ` Ghannam, Yazen
2017-04-05 17:22         ` Borislav Petkov
2017-04-05 17:53           ` Ghannam, Yazen
2017-04-05 18:22             ` Borislav Petkov
2017-04-05 19:29               ` Ghannam, Yazen
2017-04-05 20:04                 ` Borislav Petkov
2017-04-07 20:37                   ` Ghannam, Yazen
2017-04-07 21:35                     ` Borislav Petkov
2017-04-11 12:53                       ` Ghannam, Yazen
2017-04-11 13:12                         ` Borislav Petkov
2017-04-11 13:18                           ` Ghannam, Yazen
2017-04-11 13:24                             ` Borislav Petkov
2017-04-11 13:32                               ` Ghannam, Yazen
2017-04-11 13:35                                 ` Borislav Petkov
2017-04-11 13:50                                   ` Ghannam, Yazen
2017-04-11 13:54                                     ` Borislav Petkov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.