linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tony Luck <tony.luck@intel.com>
To: Borislav Petkov <bp@alien8.de>
Cc: Yazen Ghannam <yazen.ghannam@amd.com>,
	Smita.KoralahalliChannabasappa@amd.com,
	dave.hansen@linux.intel.com, x86@kernel.org,
	linux-edac@vger.kernel.org, linux-kernel@vger.kernel.org,
	patches@lists.linux.dev, Tony Luck <tony.luck@intel.com>
Subject: [PATCH v4 1/5] x86/mce: Remove old CMCI storm mitigation code
Date: Mon,  3 Apr 2023 14:07:12 -0700	[thread overview]
Message-ID: <20230403210716.347773-2-tony.luck@intel.com> (raw)
In-Reply-To: <20230403210716.347773-1-tony.luck@intel.com>

When a "storm" of CMCI is detected this code mitigates by
disabling CMCI interrupt signalling from all of the banks
owned by the CPU that saw the storm.

There are problems with this approach:

1) It is very coarse grained. In all likelihood only one of the
banks was generating the interrupts, but CMCI is disabled for all.
This means Linux may delay seeing and processing errors logged
from other banks.

2) Although CMCI stands for Corrected Machine Check Interrupt, it
is also used to signal when an uncorrected error is logged. This
is a problem because these errors should be handled in a timely
manner.

Delete all this code in preparation for a finer grained solution.

Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Yazen Ghannam <yazen.ghannam@amd.com>
Tested-by: Yazen Ghannam <yazen.ghannam@amd.com>
---
 arch/x86/kernel/cpu/mce/internal.h |   6 --
 arch/x86/kernel/cpu/mce/core.c     |  20 +---
 arch/x86/kernel/cpu/mce/intel.c    | 145 -----------------------------
 3 files changed, 1 insertion(+), 170 deletions(-)

diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 91a415553c27..f9331c6229b4 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -41,18 +41,12 @@ struct dentry *mce_get_debugfs_dir(void);
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-unsigned long cmci_intel_adjust_timer(unsigned long interval);
-bool mce_intel_cmci_poll(void);
-void mce_intel_hcpu_update(unsigned long cpu);
 void cmci_disable_bank(int bank);
 void intel_init_cmci(void);
 void intel_init_lmce(void);
 void intel_clear_lmce(void);
 bool intel_filter_mce(struct mce *m);
 #else
-# define cmci_intel_adjust_timer mce_adjust_timer_default
-static inline bool mce_intel_cmci_poll(void) { return false; }
-static inline void mce_intel_hcpu_update(unsigned long cpu) { }
 static inline void cmci_disable_bank(int bank) { }
 static inline void intel_init_cmci(void) { }
 static inline void intel_init_lmce(void) { }
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 2eec60f50057..e7936be84204 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -1588,13 +1588,6 @@ static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
 
-static unsigned long mce_adjust_timer_default(unsigned long interval)
-{
-	return interval;
-}
-
-static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-
 static void __start_timer(struct timer_list *t, unsigned long interval)
 {
 	unsigned long when = jiffies + interval;
@@ -1617,15 +1610,9 @@ static void mce_timer_fn(struct timer_list *t)
 
 	iv = __this_cpu_read(mce_next_interval);
 
-	if (mce_available(this_cpu_ptr(&cpu_info))) {
+	if (mce_available(this_cpu_ptr(&cpu_info)))
 		machine_check_poll(0, this_cpu_ptr(&mce_poll_banks));
 
-		if (mce_intel_cmci_poll()) {
-			iv = mce_adjust_timer(iv);
-			goto done;
-		}
-	}
-
 	/*
 	 * Alert userspace if needed. If we logged an MCE, reduce the polling
 	 * interval, otherwise increase the polling interval.
@@ -1635,7 +1622,6 @@ static void mce_timer_fn(struct timer_list *t)
 	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
 
-done:
 	__this_cpu_write(mce_next_interval, iv);
 	__start_timer(t, iv);
 }
@@ -1972,7 +1958,6 @@ static void mce_zhaoxin_feature_init(struct cpuinfo_x86 *c)
 
 	intel_init_cmci();
 	intel_init_lmce();
-	mce_adjust_timer = cmci_intel_adjust_timer;
 }
 
 static void mce_zhaoxin_feature_clear(struct cpuinfo_x86 *c)
@@ -1985,7 +1970,6 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
-		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
 
 	case X86_VENDOR_AMD: {
@@ -2642,8 +2626,6 @@ static void mce_reenable_cpu(void)
 
 static int mce_cpu_dead(unsigned int cpu)
 {
-	mce_intel_hcpu_update(cpu);
-
 	/* intentionally ignoring frozen here */
 	if (!cpuhp_tasks_frozen)
 		cmci_rediscover();
diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c
index 95275a5e57e0..052bf2708391 100644
--- a/arch/x86/kernel/cpu/mce/intel.c
+++ b/arch/x86/kernel/cpu/mce/intel.c
@@ -41,15 +41,6 @@
  */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
-/*
- * CMCI storm detection backoff counter
- *
- * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
- * encountered an error. If not, we decrement it by one. We signal the end of
- * the CMCI storm when it reaches 0.
- */
-static DEFINE_PER_CPU(int, cmci_backoff_cnt);
-
 /*
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
@@ -57,21 +48,6 @@ static DEFINE_PER_CPU(int, cmci_backoff_cnt);
 static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD		1
-#define CMCI_POLL_INTERVAL	(30 * HZ)
-#define CMCI_STORM_INTERVAL	(HZ)
-#define CMCI_STORM_THRESHOLD	15
-
-static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt);
-static DEFINE_PER_CPU(unsigned int, cmci_storm_state);
-
-enum {
-	CMCI_STORM_NONE,
-	CMCI_STORM_ACTIVE,
-	CMCI_STORM_SUBSIDED,
-};
-
-static atomic_t cmci_storm_on_cpus;
 
 static int cmci_supported(int *banks)
 {
@@ -127,124 +103,6 @@ static bool lmce_supported(void)
 	return tmp & FEAT_CTL_LMCE_ENABLED;
 }
 
-bool mce_intel_cmci_poll(void)
-{
-	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
-		return false;
-
-	/*
-	 * Reset the counter if we've logged an error in the last poll
-	 * during the storm.
-	 */
-	if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)))
-		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-	else
-		this_cpu_dec(cmci_backoff_cnt);
-
-	return true;
-}
-
-void mce_intel_hcpu_update(unsigned long cpu)
-{
-	if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE)
-		atomic_dec(&cmci_storm_on_cpus);
-
-	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
-}
-
-static void cmci_toggle_interrupt_mode(bool on)
-{
-	unsigned long flags, *owned;
-	int bank;
-	u64 val;
-
-	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-	owned = this_cpu_ptr(mce_banks_owned);
-	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
-		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-
-		if (on)
-			val |= MCI_CTL2_CMCI_EN;
-		else
-			val &= ~MCI_CTL2_CMCI_EN;
-
-		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-	}
-	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
-unsigned long cmci_intel_adjust_timer(unsigned long interval)
-{
-	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
-	    (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
-		mce_notify_irq();
-		return CMCI_STORM_INTERVAL;
-	}
-
-	switch (__this_cpu_read(cmci_storm_state)) {
-	case CMCI_STORM_ACTIVE:
-
-		/*
-		 * We switch back to interrupt mode once the poll timer has
-		 * silenced itself. That means no events recorded and the timer
-		 * interval is back to our poll interval.
-		 */
-		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
-		if (!atomic_sub_return(1, &cmci_storm_on_cpus))
-			pr_notice("CMCI storm subsided: switching to interrupt mode\n");
-
-		fallthrough;
-
-	case CMCI_STORM_SUBSIDED:
-		/*
-		 * We wait for all CPUs to go back to SUBSIDED state. When that
-		 * happens we switch back to interrupt mode.
-		 */
-		if (!atomic_read(&cmci_storm_on_cpus)) {
-			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
-			cmci_toggle_interrupt_mode(true);
-			cmci_recheck();
-		}
-		return CMCI_POLL_INTERVAL;
-	default:
-
-		/* We have shiny weather. Let the poll do whatever it thinks. */
-		return interval;
-	}
-}
-
-static bool cmci_storm_detect(void)
-{
-	unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
-	unsigned long ts = __this_cpu_read(cmci_time_stamp);
-	unsigned long now = jiffies;
-	int r;
-
-	if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE)
-		return true;
-
-	if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) {
-		cnt++;
-	} else {
-		cnt = 1;
-		__this_cpu_write(cmci_time_stamp, now);
-	}
-	__this_cpu_write(cmci_storm_cnt, cnt);
-
-	if (cnt <= CMCI_STORM_THRESHOLD)
-		return false;
-
-	cmci_toggle_interrupt_mode(false);
-	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
-	r = atomic_add_return(1, &cmci_storm_on_cpus);
-	mce_timer_kick(CMCI_STORM_INTERVAL);
-	this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
-
-	if (r == 1)
-		pr_notice("CMCI storm detected: switching to poll mode\n");
-	return true;
-}
-
 /*
  * The interrupt handler. This is called on every event.
  * Just call the poller directly to log any events.
@@ -253,9 +111,6 @@ static bool cmci_storm_detect(void)
  */
 static void intel_threshold_interrupt(void)
 {
-	if (cmci_storm_detect())
-		return;
-
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
 }
 
-- 
2.39.2


  reply	other threads:[~2023-04-03 21:07 UTC|newest]

Thread overview: 99+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-04-06  6:35 [RFC PATCH 0/5] Handle corrected machine check interrupt storms Smita Koralahalli
2022-04-06  6:35 ` [PATCH 1/5] x86/mce: Remove old CMCI storm mitigation code Smita Koralahalli
2022-04-06  6:35 ` [PATCH 2/5] x86/mce: Add per-bank CMCI storm mitigation Smita Koralahalli
2022-04-06  6:35 ` [RFC PATCH 3/5] x86/mce: Introduce a function pointer mce_handle_storm Smita Koralahalli
2022-04-06 22:38   ` Luck, Tony
2022-04-06  6:35 ` [RFC PATCH 4/5] x86/mce: Move storm handling to core Smita Koralahalli
2022-06-21  5:08   ` Luck, Tony
2022-06-27 17:36     ` [PATCH v2 0/5] Handle corrected machine check interrupt storms Tony Luck
2022-06-27 17:36       ` [PATCH v2 1/5] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2022-06-27 17:36       ` [PATCH v2 2/5] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2022-06-27 17:36       ` [PATCH v2 3/5] x86/mce: Introduce mce_handle_storm() to deal with begin/end of storms Tony Luck
2022-06-27 17:36       ` [PATCH v2 4/5] x86/mce: Move storm handling to core Tony Luck
2022-06-27 17:36       ` [PATCH v2 5/5] x86/mce: Handle AMD threshold interrupt storms Tony Luck
2023-03-17 14:50       ` [PATCH v2 0/5] Handle corrected machine check " Yazen Ghannam
2023-03-17 17:20         ` [PATCH v3 " Tony Luck
2023-03-17 17:20           ` [PATCH v3 1/5] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-03-17 17:20           ` [PATCH v3 2/5] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-03-17 17:20           ` [PATCH v3 3/5] x86/mce: Introduce mce_handle_storm() to deal with begin/end of storms Tony Luck
2023-03-23 15:22             ` Yazen Ghannam
2023-03-23 18:00               ` Tony Luck
2023-03-17 17:20           ` [PATCH v3 4/5] x86/mce: Move storm handling to core Tony Luck
2023-03-23 15:27             ` Yazen Ghannam
2023-03-23 18:10               ` Luck, Tony
2023-03-23 20:26                 ` Luck, Tony
2023-03-24 20:44                   ` Yazen Ghannam
2023-03-29 15:26                   ` Yazen Ghannam
2023-04-03 19:03                     ` Luck, Tony
2023-04-03 21:07                     ` [PATCH v4 0/5] Handle corrected machine check interrupt storms Tony Luck
2023-04-03 21:07                       ` Tony Luck [this message]
2023-04-03 21:07                       ` [PATCH v4 2/5] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-04-11 12:32                         ` Borislav Petkov
2023-04-11 14:06                           ` Yazen Ghannam
2023-04-11 16:06                             ` Luck, Tony
2023-04-11 17:17                               ` Borislav Petkov
2023-04-03 21:07                       ` [PATCH v4 3/5] x86/mce: Introduce mce_handle_storm() to deal with begin/end of storms Tony Luck
2023-04-03 21:07                       ` [PATCH v4 4/5] x86/mce: Move storm handling to core Tony Luck
2023-04-03 21:07                       ` [PATCH v4 5/5] x86/mce: Handle AMD threshold interrupt storms Tony Luck
2023-04-11 17:38                       ` [PATCH v5 0/5] Handle corrected machine check " Tony Luck
2023-04-11 17:38                         ` [PATCH v5 1/5] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-04-11 17:38                         ` [PATCH v5 2/5] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-06-13 17:45                           ` Borislav Petkov
2023-06-16 18:15                             ` Tony Luck
2023-04-11 17:38                         ` [PATCH v5 3/5] x86/mce: Introduce mce_handle_storm() to deal with begin/end of storms Tony Luck
2023-04-11 17:38                         ` [PATCH v5 4/5] x86/mce: Move storm handling to core Tony Luck
2023-04-11 17:38                         ` [PATCH v5 5/5] x86/mce: Handle AMD threshold interrupt storms Tony Luck
2023-06-16 18:27                         ` [PATCH v6 0/4] Handle corrected machine check " Tony Luck
2023-06-16 18:27                           ` [PATCH v6 1/4] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-06-16 18:27                           ` [PATCH v6 2/4] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-06-23 12:09                             ` Borislav Petkov
2023-06-23 15:40                               ` Luck, Tony
2023-07-17  8:58                                 ` Borislav Petkov
2023-06-16 18:27                           ` [PATCH v6 3/4] x86/mce: Handle AMD threshold interrupt storms Tony Luck
2023-06-23 14:45                             ` Borislav Petkov
2023-06-23 15:54                               ` Yazen Ghannam
2023-06-16 18:27                           ` [PATCH v6 4/4] x86/mce: Handle Intel " Tony Luck
2023-07-18 21:08                           ` [PATCH v7 0/3] Handle corrected machine check " Tony Luck
2023-07-18 21:08                             ` [PATCH v7 1/3] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-07-18 21:08                             ` [PATCH v7 2/3] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-09-19 17:44                               ` Yazen Ghannam
2023-09-20 15:56                               ` Yazen Ghannam
2023-09-20 16:09                                 ` Luck, Tony
2023-07-18 21:08                             ` [PATCH v7 3/3] x86/mce: Handle Intel threshold interrupt storms Tony Luck
2023-09-19 17:59                               ` Yazen Ghannam
2023-09-29 18:16                             ` [PATCH v8 0/3] Handle corrected machine check " Tony Luck
2023-09-29 18:16                               ` [PATCH v8 1/3] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-09-29 18:16                               ` [PATCH v8 2/3] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-09-29 18:16                               ` [PATCH v8 3/3] x86/mce: Handle Intel threshold interrupt storms Tony Luck
2023-10-02 17:57                               ` [PATCH v8 0/3] Handle corrected machine check " Luck, Tony
2023-10-04 18:36                               ` [PATCH v9 " Tony Luck
2023-10-04 18:36                                 ` [PATCH v9 1/3] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-10-04 18:36                                 ` [PATCH v9 2/3] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-10-11  9:11                                   ` kernel test robot
2023-10-11 15:16                                     ` Luck, Tony
2023-10-11 15:42                                       ` Feng Tang
2023-10-11 17:23                                         ` Luck, Tony
2023-10-12  5:36                                           ` Feng Tang
2023-10-12  5:56                                             ` Feng Tang
2023-10-12  2:35                                         ` Philip Li
2023-10-19 15:12                                   ` Borislav Petkov
2023-10-23 18:14                                     ` Tony Luck
2023-11-14 19:23                                       ` Borislav Petkov
2023-11-14 22:04                                         ` Tony Luck
2023-11-21 11:54                                           ` Borislav Petkov
2023-11-27 19:50                                             ` Tony Luck
2023-11-27 20:14                                               ` Tony Luck
2023-11-28  0:42                                                 ` Tony Luck
2023-11-28 15:32                                                   ` Yazen Ghannam
2023-12-14 16:58                                                   ` Borislav Petkov
2023-12-14 18:03                                                     ` Luck, Tony
2023-10-04 18:36                                 ` [PATCH v9 3/3] x86/mce: Handle Intel threshold interrupt storms Tony Luck
2023-11-15 19:54                                 ` [PATCH v10 0/3] Handle corrected machine check " Tony Luck
2023-11-15 19:54                                   ` [PATCH v10 1/3] x86/mce: Remove old CMCI storm mitigation code Tony Luck
2023-11-15 19:54                                   ` [PATCH v10 2/3] x86/mce: Add per-bank CMCI storm mitigation Tony Luck
2023-11-15 19:54                                   ` [PATCH v10 3/3] x86/mce: Handle Intel threshold interrupt storms Tony Luck
2023-03-17 17:20           ` [PATCH v3 5/5] x86/mce: Handle AMD " Tony Luck
2022-04-06  6:35 ` [RFC PATCH " Smita Koralahalli
2022-04-06 22:44   ` Luck, Tony
2022-04-08  7:48     ` Koralahalli Channabasappa, Smita
2022-04-08 19:29       ` Luck, Tony

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230403210716.347773-2-tony.luck@intel.com \
    --to=tony.luck@intel.com \
    --cc=Smita.KoralahalliChannabasappa@amd.com \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=patches@lists.linux.dev \
    --cc=x86@kernel.org \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).