linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Aristeu Rozanski <aris@ruivo.org>
To: Borislav Petkov <bp@alien8.de>
Cc: Tony Luck <tony.luck@intel.com>,
	"linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	"aris@redhat.com" <aris@redhat.com>
Subject: Re: [PATCH] x86/mce: Prevent duplicate error records
Date: Thu, 20 Jul 2023 08:53:34 -0400	[thread overview]
Message-ID: <20230720125334.GD94963@cathedrallabs.org> (raw)
In-Reply-To: <20230720054908.GAZLjK1CSIrioNSI/f@fat_crate.local>

On Thu, Jul 20, 2023 at 07:49:08AM +0200, Borislav Petkov wrote:
> A legitimate use case of the MCA infrastructure is to have the firmware
> log all uncorrectable errors and also, have the OS see all correctable
> errors.
> 
> The uncorrectable, UCNA errors are usually configured to be reported
> through an SMI. CMCI, which is the correctable error reporting
> interrupt, uses SMI too and having both enabled, leads to unnecessary
> overhead.
> 
> So what ends up happening is, people disable CMCI in the wild and leave
> on only the UCNA SMI.
> 
> When CMCI is disabled, the MCA infrastructure resorts to polling the MCA
> banks. If a MCA MSR is shared between the logical threads, one error
> ends up getting logged multiple times as the polling runs on every
> logical thread.
> 
> Therefore, introduce locking on the Intel side of the polling routine to
> prevent such duplicate error records from appearing.
> 
> Based on a patch by Aristeu Rozanski <aris@ruivo.org>.
> 
> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
> Tested-by: Tony Luck <tony.luck@intel.com>
> Link: https://lore.kernel.org/r/20230515143225.GC4090740@cathedrallabs.org
> ---
>  arch/x86/kernel/cpu/mce/core.c     |  9 ++++++++-
>  arch/x86/kernel/cpu/mce/intel.c    | 19 ++++++++++++++++++-
>  arch/x86/kernel/cpu/mce/internal.h |  1 +
>  3 files changed, 27 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
> index 89e2aab5d34d..b8ad5a5b4026 100644
> --- a/arch/x86/kernel/cpu/mce/core.c
> +++ b/arch/x86/kernel/cpu/mce/core.c
> @@ -1608,6 +1608,13 @@ static void __start_timer(struct timer_list *t, unsigned long interval)
>  	local_irq_restore(flags);
>  }
>  
> +static void mc_poll_banks_default(void)
> +{
> +	machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
> +}
> +
> +void (*mc_poll_banks)(void) = mc_poll_banks_default;
> +
>  static void mce_timer_fn(struct timer_list *t)
>  {
>  	struct timer_list *cpu_t = this_cpu_ptr(&mce_timer);
> @@ -1618,7 +1625,7 @@ static void mce_timer_fn(struct timer_list *t)
>  	iv = __this_cpu_read(mce_next_interval);
>  
>  	if (mce_available(this_cpu_ptr(&cpu_info))) {
> -		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
> +		mc_poll_banks();
>  
>  		if (mce_intel_cmci_poll()) {
>  			iv = mce_adjust_timer(iv);
> diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
> index 95275a5e57e0..f5323551c1a9 100644
> --- a/arch/x86/kernel/cpu/mce/intel.c
> +++ b/arch/x86/kernel/cpu/mce/intel.c
> @@ -56,6 +56,13 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
>   */
>  static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
>  
> +/*
> + * On systems that do support CMCI but it's disabled, polling for MCEs can
> + * cause the same event to be reported multiple times because IA32_MCi_STATUS
> + * is shared by the same package.
> + */
> +static DEFINE_SPINLOCK(cmci_poll_lock);
> +
>  #define CMCI_THRESHOLD		1
>  #define CMCI_POLL_INTERVAL	(30 * HZ)
>  #define CMCI_STORM_INTERVAL	(HZ)
> @@ -426,12 +433,22 @@ void cmci_disable_bank(int bank)
>  	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
>  }
>  
> +/* Bank polling function when CMCI is disabled. */
> +static void cmci_mc_poll_banks(void)
> +{
> +	spin_lock(&cmci_poll_lock);
> +	machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
> +	spin_unlock(&cmci_poll_lock);
> +}
> +
>  void intel_init_cmci(void)
>  {
>  	int banks;
>  
> -	if (!cmci_supported(&banks))
> +	if (!cmci_supported(&banks)) {
> +		mc_poll_banks = cmci_mc_poll_banks;
>  		return;
> +	}
>  
>  	mce_threshold_vector = intel_threshold_interrupt;
>  	cmci_discover(banks);
> diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
> index d2412ce2d312..ed4a71c0f093 100644
> --- a/arch/x86/kernel/cpu/mce/internal.h
> +++ b/arch/x86/kernel/cpu/mce/internal.h
> @@ -274,4 +274,5 @@ static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg)
>  	return 0;
>  }
>  
> +extern void (*mc_poll_banks)(void);
>  #endif /* __X86_MCE_INTERNAL_H__ */

Acked-by: Aristeu Rozanski <aris@ruivo.org>

-- 
Aristeu


      reply	other threads:[~2023-07-20 12:53 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-07-17 15:23 [PATCH v4] mce: prevent concurrent polling of MCE events Aristeu Rozanski
2023-07-19  9:26 ` Borislav Petkov
2023-07-19 17:32   ` Luck, Tony
2023-07-19 18:07   ` Aristeu Rozanski
2023-07-20  5:49     ` [PATCH] x86/mce: Prevent duplicate error records Borislav Petkov
2023-07-20 12:53       ` Aristeu Rozanski [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230720125334.GD94963@cathedrallabs.org \
    --to=aris@ruivo.org \
    --cc=aris@redhat.com \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).