linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Dump cper error table in mce_panic
@ 2020-11-04  6:50 yaoaili126
  2020-11-04 10:16 ` kernel test robot
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: yaoaili126 @ 2020-11-04  6:50 UTC (permalink / raw)
  To: rjw, lenb, tony.luck, bp, james.morse
  Cc: linux-acpi, linux-edac, yangfeng1, CHENGUOMIN, yaoaili

From: Aili Yao <yaoaili@kingsoft.com>

For X86_MCE, when there is a fatal UE error, BIOS will prepare one
detailed CPER error table before raising the MCE. This CPER table is meant
to supply additional error information and not to race with the MCE handler
to panic.

Usually, an unexpected CPER processing from the NMI watchdog racing to panic
with the MCE panic is not a problem; the panic process will coordinate between
the cores. But when the CPER is not processed in the first kernel and is
left to the second kernel, it is a problem and leads to a kdump failure.

Now in this patch, mce_panic() will race with the unexpected NMI to dump
the CPER error log and get it cleaned. This will prevent the CPER table
from leaking into the second kernel, which fixes the kdump failure, and
also guarantees the CPER log is collected as it is meant to be.

Anyway, for an x86_mce platform, the ghes module still needs not to
panic for a fatal memory UE, as that is the MCE handler's work.

Signed-off-by: Aili Yao <yaoaili@kingsoft.com>
---
 arch/x86/kernel/cpu/mce/core.c     |  2 +
 arch/x86/kernel/cpu/mce/internal.h |  5 ++
 drivers/acpi/apei/ghes.c           | 79 ++++++++++++++++++++++++++++++
 include/acpi/ghes.h                |  2 +
 4 files changed, 88 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index 4102b866e7c0..22efa708ef53 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -341,6 +341,8 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
 		if (!apei_err)
 			apei_err = apei_write_mce(final);
 	}
+	/* Print possible additional cper error info, get cper cleared */
+	ghes_in_mce_cper_entry_check();
 	if (cpu_missing)
 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 	if (exp)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 88dcc79cfb07..3aea48400af3 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -71,6 +71,7 @@ int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
 int apei_check_mce(void);
 int apei_clear_mce(u64 record_id);
+extern int ghes_in_mce_cper_entry_check(void);
 #else
 static inline int apei_write_mce(struct mce *m)
 {
@@ -88,6 +89,10 @@ static inline int apei_clear_mce(u64 record_id)
 {
 	return -EINVAL;
 }
+static inline int ghes_in_mce_cper_entry_check(void)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index fce7ade2aba9..2c4274a0bec0 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -1147,9 +1147,88 @@ static void ghes_nmi_remove(struct ghes *ghes)
 	 */
 	synchronize_rcu();
 }
+
+/*
+ * Only called by mce_panic, Return value will be ignored, just for debug
+ * purpose; when mce_panic is called, there may be meanwhile other hw error
+ * triggered through NMI, this function may lead that NMI unhandled,
+ * as we are in panic, collecting log will be sufficient.
+ */
+int ghes_in_mce_cper_entry_check(void)
+{
+	int ret = -ENOENT;
+	struct ghes *ghes;
+	struct list_head *rcu_list = &ghes_nmi;
+	enum fixed_addresses fixmap_idx = FIX_APEI_GHES_NMI;
+	struct acpi_hest_generic_status *estatus, tmp_header;
+	struct ghes_estatus_node *estatus_node;
+	u32 len, node_len;
+	u64 buf_paddr;
+
+	/* if NMI handler already in process, let NMI do its job */
+	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, rcu_list, list) {
+		int rc;
+
+		rc = __ghes_peek_estatus(ghes, &tmp_header, &buf_paddr, fixmap_idx);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		rc = __ghes_check_estatus(ghes, &tmp_header);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		len = cper_estatus_len(&tmp_header);
+		node_len = GHES_ESTATUS_NODE_LEN(len);
+		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
+		if (!estatus_node) {
+			/* Going to panic, No need to keep the error. */
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = -ENOMEM;
+			goto done;
+		}
+
+		estatus_node->ghes = ghes;
+		estatus_node->generic = ghes->generic;
+		estatus_node->task_work.func = NULL;
+		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+
+		if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
+			ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+			gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+			ret = -ENOENT;
+			continue;
+		}
+
+		/*
+		 * As we are going to panic, and preemt the possible NMI handing,
+		 * dump all the info and get it cleared.
+		 */
+		ghes_print_queued_estatus();
+		__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
+		ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+		      node_len);
+	}
+done:
+	rcu_read_unlock();
+	atomic_dec(&ghes_in_nmi);
+	return ret;
+}
 #else /* CONFIG_HAVE_ACPI_APEI_NMI */
 static inline void ghes_nmi_add(struct ghes *ghes) { }
 static inline void ghes_nmi_remove(struct ghes *ghes) { }
+int ghes_in_mce_cper_entry_check(void) {}
 #endif /* CONFIG_HAVE_ACPI_APEI_NMI */
 
 static void ghes_nmi_init_cxt(void)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 34fb3431a8f3..be1ee0e993d2 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -145,4 +145,6 @@ int ghes_notify_sea(void);
 static inline int ghes_notify_sea(void) { return -ENOENT; }
 #endif
 
+int ghes_in_mce_cper_entry_check(void);
+
 #endif /* GHES_H */
-- 
2.18.4



^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] Dump cper error table in mce_panic
  2020-11-04  6:50 [PATCH] Dump cper error table in mce_panic yaoaili126
@ 2020-11-04 10:16 ` kernel test robot
  2020-11-06 19:35 ` James Morse
  2020-11-17  9:58 ` [PATCH v2] " Aili Yao
  2 siblings, 0 replies; 16+ messages in thread
From: kernel test robot @ 2020-11-04 10:16 UTC (permalink / raw)
  To: yaoaili126, rjw, lenb, tony.luck, bp, james.morse
  Cc: kbuild-all, linux-acpi, linux-edac, yangfeng1, CHENGUOMIN, yaoaili

[-- Attachment #1: Type: text/plain, Size: 3682 bytes --]

Hi,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on tip/x86/core]
[also build test ERROR on pm/linux-next v5.10-rc2 next-20201103]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:    https://github.com/0day-ci/linux/commits/yaoaili126-163-com/Dump-cper-error-table-in-mce_panic/20201104-150937
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git 238c91115cd05c71447ea071624a4c9fe661f970
config: x86_64-randconfig-s021-20201104 (attached as .config)
compiler: gcc-9 (Debian 9.3.0-15) 9.3.0
reproduce:
        # apt-get install sparse
        # sparse version: v0.6.3-76-gf680124b-dirty
        # https://github.com/0day-ci/linux/commit/b11831c841cb8046a9e01300f5d91985c293e045
        git remote add linux-review https://github.com/0day-ci/linux
        git fetch --no-tags linux-review yaoaili126-163-com/Dump-cper-error-table-in-mce_panic/20201104-150937
        git checkout b11831c841cb8046a9e01300f5d91985c293e045
        # save the attached .config to linux build tree
        make W=1 C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__' ARCH=x86_64 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   ld: arch/x86/kernel/cpu/mce/core.o: in function `mce_panic':
>> arch/x86/kernel/cpu/mce/core.c:346: undefined reference to `ghes_in_mce_cper_entry_check'

vim +346 arch/x86/kernel/cpu/mce/core.c

   297	
   298	static void mce_panic(const char *msg, struct mce *final, char *exp)
   299	{
   300		int apei_err = 0;
   301		struct llist_node *pending;
   302		struct mce_evt_llist *l;
   303	
   304		if (!fake_panic) {
   305			/*
   306			 * Make sure only one CPU runs in machine check panic
   307			 */
   308			if (atomic_inc_return(&mce_panicked) > 1)
   309				wait_for_panic();
   310			barrier();
   311	
   312			bust_spinlocks(1);
   313			console_verbose();
   314		} else {
   315			/* Don't log too much for fake panic */
   316			if (atomic_inc_return(&mce_fake_panicked) > 1)
   317				return;
   318		}
   319		pending = mce_gen_pool_prepare_records();
   320		/* First print corrected ones that are still unlogged */
   321		llist_for_each_entry(l, pending, llnode) {
   322			struct mce *m = &l->mce;
   323			if (!(m->status & MCI_STATUS_UC)) {
   324				print_mce(m);
   325				if (!apei_err)
   326					apei_err = apei_write_mce(m);
   327			}
   328		}
   329		/* Now print uncorrected but with the final one last */
   330		llist_for_each_entry(l, pending, llnode) {
   331			struct mce *m = &l->mce;
   332			if (!(m->status & MCI_STATUS_UC))
   333				continue;
   334			if (!final || mce_cmp(m, final)) {
   335				print_mce(m);
   336				if (!apei_err)
   337					apei_err = apei_write_mce(m);
   338			}
   339		}
   340		if (final) {
   341			print_mce(final);
   342			if (!apei_err)
   343				apei_err = apei_write_mce(final);
   344		}
   345		/* Print possible additional cper error info, get cper cleared */
 > 346		ghes_in_mce_cper_entry_check();
   347		if (cpu_missing)
   348			pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
   349		if (exp)
   350			pr_emerg(HW_ERR "Machine check: %s\n", exp);
   351		if (!fake_panic) {
   352			if (panic_timeout == 0)
   353				panic_timeout = mca_cfg.panic_timeout;
   354			panic(msg);
   355		} else
   356			pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
   357	}
   358	

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 36369 bytes --]

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] Dump cper error table in mce_panic
  2020-11-04  6:50 [PATCH] Dump cper error table in mce_panic yaoaili126
  2020-11-04 10:16 ` kernel test robot
@ 2020-11-06 19:35 ` James Morse
  2020-11-18  3:12   ` Aili Yao
  2020-11-17  9:58 ` [PATCH v2] " Aili Yao
  2 siblings, 1 reply; 16+ messages in thread
From: James Morse @ 2020-11-06 19:35 UTC (permalink / raw)
  To: yaoaili126
  Cc: rjw, lenb, tony.luck, bp, linux-acpi, linux-edac, yangfeng1,
	CHENGUOMIN, yaoaili

Hello!

On 04/11/2020 06:50, yaoaili126@163.com wrote:
> From: Aili Yao <yaoaili@kingsoft.com>
> 
> For X86_MCE, When there is a fatal ue error, BIOS will prepare one
> detailed cper error table before raising MCE,

(outside GHES-ASSIST), Its not supposed to do this.

There is an example flow described in 18.4.1 "Example: Firmware First Handling Using NMI
Notification" of ACPI v6.3:
https://uefi.org/sites/default/files/resources/ACPI_Spec_6_3_A_Oct_6_2020.pdf


The machine-check is the notification from hardware, which in step 1 of the above should
go to firmware. You should only see an NMI, which is step 8.
Step 7 is to clear the error from hardware, so triggering a machine-check is pointless.
(but I agree no firmware ever follows this!)


You appear to have something that behaves as GHES-ASSIST. Can you post the decompiled dump
of your HEST table? (decompiled, no binaries!) If its large, you can post it to me off
list and I'll copy the relevant bits here...


> this cper table is meant
> to supply addtional error information and not to race with mce handler
> to panic.

This is a description of GHES_ASSIST. See 18.7 "GHES_ASSIST Error Reporting" of the above pdf.


> Usually possible unexpected cper process from NMI watchdog race panic
> with MCE panic is not a problem, the panic process will coordinate with
> each core. But When the CPER is not processed in the first kernel and
> leave it to the second kernel, this is a problem, lead to a kdump fail.

> Now in this patch, the mce_panic will race with unexpected NMI to dump
> the cper error log and get it cleaned, this will prevent the cper table
> leak to the second kernel, which will fix the kdump fail problem, and
> also guarrante the cper log is collected which it's meant to.

> Anyway,For x86_mce platform, the ghes module is still needed not to
> panic for fatal memory UE as it's MCE handler's work.

If and only if those GHES are marked as GHES_ASSIST.

If they are not, then you have a fully fledged firwmare-first system.

Could you share what your system is describing it as in the HEST so we can work out what
is going on here?!

We need to work this out first.


Thanks,

James


^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH v2] Dump cper error table in mce_panic
  2020-11-04  6:50 [PATCH] Dump cper error table in mce_panic yaoaili126
  2020-11-04 10:16 ` kernel test robot
  2020-11-06 19:35 ` James Morse
@ 2020-11-17  9:58 ` Aili Yao
  2020-11-18 12:45   ` Borislav Petkov
  2 siblings, 1 reply; 16+ messages in thread
From: Aili Yao @ 2020-11-17  9:58 UTC (permalink / raw)
  To: rjw, lenb, tony.luck, bp, james.morse
  Cc: linux-acpi, linux-edac, yangfeng1, CHENGUOMIN

From 3b2602aa50f37321f9b05f9e377ac52f8a1db90a Mon Sep 17 00:00:00 2001
From: Aili Yao <yaoaili@kingsoft.com>
Date: Tue, 17 Nov 2020 17:41:54 +0800
Subject: [PATCH v2] Dump cper error table in mce_panic

For X86_MCE, when the BIOS option WHEA memory log is enabled, if there is a
fatal UE error, BIOS will prepare one CPER error table before raising the MCE.
This CPER table is meant to supply additional error information and not to
race with the MCE handler to panic, but currently ghes_notify_nmi() from an
unexpected NMI watchdog may race to panic with the MCE.

Usually, an unexpected CPER processing from the NMI watchdog racing to panic
with the MCE panic is not a problem; the panic process will coordinate between
the cores. But when the CPER is not processed in the first kernel and is
left to the second kernel, that is a problem and leads to a kdump failure.

Now in this patch, mce_panic() will race with the unexpected NMI to dump
the CPER error log and get it cleaned. This will prevent the CPER table
from being leaked to the second kernel, which will prevent ghes_notify_nmi()
from processing it and fix the kdump failure, and also guarantees the CPER
log is collected as it is meant to be.

Anyway, for an x86_mce platform, the ghes module had better not panic on a
fatal memory UE, as that is the MCE handler's work.

Signed-off-by: Aili Yao <yaoaili@kingsoft.com>
---
 arch/x86/kernel/cpu/mce/apei.c     |  5 ++
 arch/x86/kernel/cpu/mce/core.c     |  2 +
 arch/x86/kernel/cpu/mce/internal.h |  5 ++
 drivers/acpi/apei/ghes.c           | 78 ++++++++++++++++++++++++++++++
 include/acpi/ghes.h                |  6 +++
 5 files changed, 96 insertions(+)

diff --git a/arch/x86/kernel/cpu/mce/apei.c b/arch/x86/kernel/cpu/mce/apei.c
index af8d37962586..d2dcae90613b 100644
--- a/arch/x86/kernel/cpu/mce/apei.c
+++ b/arch/x86/kernel/cpu/mce/apei.c
@@ -143,3 +143,8 @@ int apei_clear_mce(u64 record_id)
 {
 	return erst_clear(record_id);
 }
+
+int apei_check_cper(void)
+{
+	return ghes_in_mce_cper_entry_check();
+}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index f43a78bde670..ce468bed352d 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -342,6 +342,8 @@ static void mce_panic(const char *msg, struct mce *final, char *exp)
 		if (!apei_err)
 			apei_err = apei_write_mce(final);
 	}
+	/* Print possible additional cper error info, get cper cleared */
+	apei_check_cper();
 	if (cpu_missing)
 		pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
 	if (exp)
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 6473070b5da4..ad68119fb15c 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -70,6 +70,7 @@ int apei_write_mce(struct mce *m);
 ssize_t apei_read_mce(struct mce *m, u64 *record_id);
 int apei_check_mce(void);
 int apei_clear_mce(u64 record_id);
+int apei_check_cper(void);
 #else
 static inline int apei_write_mce(struct mce *m)
 {
@@ -87,6 +88,10 @@ static inline int apei_clear_mce(u64 record_id)
 {
 	return -EINVAL;
 }
+static inline int apei_check_cper(void)
+{
+	return 0;
+}
 #endif
 
 /*
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 81bf71b10d44..80342e2f9760 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -1084,6 +1084,84 @@ static void ghes_nmi_remove(struct ghes *ghes)
 	 */
 	synchronize_rcu();
 }
+
+/*
+ * Only called by mce_panic, Return value will be ignored, just for debug
+ * purpose; when mce_panic is called, there may be meanwhile other hw error
+ * triggered through NMI, this function may lead that NMI unhandled,
+ * as we are in panic, collecting log will be sufficient.
+ */
+int ghes_in_mce_cper_entry_check(void)
+{
+	int ret = -ENOENT;
+	struct ghes *ghes;
+	struct list_head *rcu_list = &ghes_nmi;
+	enum fixed_addresses fixmap_idx = FIX_APEI_GHES_NMI;
+	struct acpi_hest_generic_status *estatus, tmp_header;
+	struct ghes_estatus_node *estatus_node;
+	u32 len, node_len;
+	u64 buf_paddr;
+
+	/* if NMI handler already in process, let NMI do its job */
+	if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+		return 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(ghes, rcu_list, list) {
+		int rc;
+
+		rc = __ghes_peek_estatus(ghes, &tmp_header, &buf_paddr, fixmap_idx);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		rc = __ghes_check_estatus(ghes, &tmp_header);
+		if (rc) {
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = rc;
+			continue;
+		}
+
+		len = cper_estatus_len(&tmp_header);
+		node_len = GHES_ESTATUS_NODE_LEN(len);
+		estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
+		if (!estatus_node) {
+			/* Going to panic, No need to keep the error. */
+			ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+			ret = -ENOMEM;
+			continue;
+		}
+
+		estatus_node->ghes = ghes;
+		estatus_node->generic = ghes->generic;
+		estatus_node->task_work.func = NULL;
+		estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+
+		if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
+			ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+			gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+			ret = -ENOENT;
+			continue;
+		}
+
+		/*
+		 * As we are going to panic, and preemt the possible NMI handing,
+		 * dump all the info and get it cleared.
+		 */
+		ghes_print_queued_estatus();
+		__ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
+		ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+
+		gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+		      node_len);
+	}
+
+	rcu_read_unlock();
+	atomic_dec(&ghes_in_nmi);
+	return ret;
+}
 #else /* CONFIG_HAVE_ACPI_APEI_NMI */
 static inline void ghes_nmi_add(struct ghes *ghes) { }
 static inline void ghes_nmi_remove(struct ghes *ghes) { }
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 517a5231cc1b..52a8638a0495 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -127,4 +127,10 @@ int ghes_notify_sea(void);
 static inline int ghes_notify_sea(void) { return -ENOENT; }
 #endif
 
+#if defined(CONFIG_ACPI_APEI_GHES) && defined(CONFIG_HAVE_ACPI_APEI_NMI)
+int ghes_in_mce_cper_entry_check(void);
+#else
+static inline int ghes_in_mce_cper_entry_check(void) { return 0; }
+#endif
+
 #endif /* GHES_H */
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] Dump cper error table in mce_panic
  2020-11-06 19:35 ` James Morse
@ 2020-11-18  3:12   ` Aili Yao
  0 siblings, 0 replies; 16+ messages in thread
From: Aili Yao @ 2020-11-18  3:12 UTC (permalink / raw)
  To: James Morse
  Cc: rjw, lenb, tony.luck, bp, linux-acpi, linux-edac, yangfeng1,
	CHENGUOMIN, yaoaili

Hi, Thanks for your comments!

On Fri, 6 Nov 2020 19:35:32 +0000
James Morse <james.morse@arm.com> wrote:

> You appear to have something that behaves as GHES-ASSIST. Can you post the decompiled dump
> of your HEST table? (decompiled, no binaries!) If its large, you can post it to me off
> list and I'll copy the relevant bits here...
> 
I think we can reach a consensus; see and follow Intel Document #563361, section 23.1:
Feature Name MCA 2.0 Recovery (as per EMCA Gen2 architecture)
Feature Description
Software layer assisted recovery from uncorrected data errors as defined by the EMCA Gen2
specification. EMCA Gen2 is a capability that allows firmware to intercept errors triggered via Machine
Check Architecture (corrected and uncorrected errors) enabling a Firmware First Model (FFM) of error
handling and possible recovery.
Use Case
Enhanced error reporting to support Firmware First Model (FFM) with following attributes:
1. Allows the SMM code to intercept the MCE/CMCI.
2. Allows the SMM code to write the MCA Status/Add/Misc registers.
3. Allows the SMM code to generate MCEs.
4. Allows the DSM based pointer for enhanced error logs.
5. Additional IA32_MCG_CAP bit for eMCA support

> 
> If and only if those GHES are marked as GHES_ASSIST.
> 
> If they are not, then you have a fully fledged firwmare-first system.
> 

Yeah, this should be GHES_ASSIST. But for x86, the BIOS doesn't supply a HEST table
for it as the BIOS will trigger the MCE; it's out of the APEI scope.


-- 
Best Regards!

Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-17  9:58 ` [PATCH v2] " Aili Yao
@ 2020-11-18 12:45   ` Borislav Petkov
  2020-11-19  5:40     ` Aili Yao
  0 siblings, 1 reply; 16+ messages in thread
From: Borislav Petkov @ 2020-11-18 12:45 UTC (permalink / raw)
  To: Aili Yao
  Cc: rjw, lenb, tony.luck, james.morse, linux-acpi, linux-edac,
	yangfeng1, CHENGUOMIN

On Tue, Nov 17, 2020 at 05:58:04PM +0800, Aili Yao wrote:
> Subject: [PATCH v2] Dump cper error table in mce_panic
> 
> For X86_MCE, When BIOS option WHEA memory log is enabled,if there is a
> ...

This commit message makes no sense to me and I have no clue what you're
trying to "fix". So before you do anything, please describe the problem
you're seeing first using this structure:

Problem is A.

It happens because of B.

Fix it by doing C.

(Potentially do D).

When you do that, use simple english sentences and not complex ones.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-18 12:45   ` Borislav Petkov
@ 2020-11-19  5:40     ` Aili Yao
  2020-11-19 17:45       ` Borislav Petkov
  0 siblings, 1 reply; 16+ messages in thread
From: Aili Yao @ 2020-11-19  5:40 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: rjw, lenb, tony.luck, james.morse, linux-acpi, linux-edac,
	yangfeng1, CHENGUOMIN

Hi, Thanks for your reply! 

On Wed, 18 Nov 2020 13:45:38 +0100
Borislav Petkov <bp@alien8.de> wrote:

> 
> Problem is A.
> 

1. There are a lot of memory UE crash cases in the production environment, but no vmcore-dmesg.txt is collected.
The only log we got is the SEL log; we don't know what is happening in the kernel.

2.Then I tested this using EINJ module, I injected a Fatal memory error, checked the console log followed: 

Error injected:

[  140.833351] EINJ: Error INJection is initialized.
[  140.852107] EDAC MC: Removed device 0 for skx_edac Skylake Socket#0 IMC#0: DEV 0000:2e:0a.0
[  140.867095] EDAC MC: Removed device 1 for skx_edac Skylake Socket#0 IMC#1: DEV 0000:2e:0c.0
[  140.885091] EDAC MC: Removed device 2 for skx_edac Skylake Socket#1 IMC#0: DEV 0000:ae:0a.0
[  140.900096] EDAC MC: Removed device 3 for skx_edac Skylake Socket#1 IMC#1: DEV 0000:ae:0c.0
[  144.092321] Disabling lock debugging due to kernel taint
[  144.098423] core: [Hardware Error]: CPU 18: Machine Check Exception: 5 Bank 7: be00000001010090
[  144.108138] core: [Hardware Error]: RIP !INEXACT! 10:<ffffffffa16506be> {native_safe_halt+0xe/0x10}
[  144.118239] core: [Hardware Error]: TSC a806dc0f34 ADDR 5c7a4df000 MISC 200000c020002086 
[  144.127366] core: [Hardware Error]: PROCESSOR 0:50657 TIME 1605754444 SOCKET 1 APIC 40 microcode 5000021
[  144.137946] core: [Hardware Error]: Run the above through 'mcelog --ascii'
[  144.148275] core: [Hardware Error]: Machine check: Processor context corrupt
[  144.156140] Kernel panic - not syncing: Fatal machine check

Kdump triggered:

[    0.000000] Linux version 4.18.0+  #37 SMP Thu Nov 19 10:38:16 CST 2020
[    0.000000] Command line: BOOT_IMAGE=/vmlinuz-4.18.0+ ro nomodeset net.ifnames=0 biosdevname=0 rdloaddriver=mlx5_core rdloaddriver=i40e rdloaddriver=ixgbe strict-devmem=0 console=ttyS0,115200n8 irqpoll nr_cpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 rootflags=nofail acpi_no_memhotplug transparent_hugepage=never nokaslr disable_cpu_apicid=0 elfcorehdr=403684744K
[    0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x020: 'AVX-512 opmask'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x040: 'AVX-512 Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x080: 'AVX-512 ZMM_Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys User registers'
[    0.000000] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[    0.000000] x86/fpu: xstate_offset[3]:  832, xstate_sizes[3]:   64
[    0.000000] x86/fpu: xstate_offset[4]:  896, xstate_sizes[4]:   64
[    0.000000] x86/fpu: xstate_offset[5]:  960, xstate_sizes[5]:   64
[    0.000000] x86/fpu: xstate_offset[6]: 1024, xstate_sizes[6]:  512
[    0.000000] x86/fpu: xstate_offset[7]: 1536, xstate_sizes[7]: 1024
[    0.000000] x86/fpu: xstate_offset[9]: 2560, xstate_sizes[9]:    8
[    0.000000] x86/fpu: Enabled xstate features 0x2ff, context size is 2568 bytes, using 'compacted' format.
[    0.000000] BIOS-provided physical RAM map:
[    0.000000] BIOS-e820: [mem 0x0000000000001000-0x000000000009afff] usable
[    0.000000] BIOS-e820: [mem 0x0000000095000000-0x00000000a4ffffff] usable
[    0.000000] BIOS-e820: [mem 0x00000000a5a74000-0x00000000a5f3ffff] ACPI data
[    0.000000] BIOS-e820: [mem 0x00000000a5f40000-0x00000000a6b56fff] ACPI NVS
[    0.000000] BIOS-e820: [mem 0x0000005fbf000000-0x000000603ef61fff] usable
...
...
...
[    2.597514] ACPI: Power Button [PWRF]
[    2.601782] APEI: Can not request [mem 0xa7d0e040-0xa7d0e04c] for APEI ERST registers
[    2.610529] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
[    2.611493] {1}[Hardware Error]: event severity: fatal
[    2.611493] {1}[Hardware Error]:  Error 0, type: fatal
[    2.611493] {1}[Hardware Error]:  fru_text: Card03, ChnA, DIMM0
[    2.611493] {1}[Hardware Error]:   section_type: memory error
[    2.611493] {1}[Hardware Error]:   error_status: 0x0000000000000000
[    2.611493] {1}[Hardware Error]:   physical_address: 0x0000005c7a4df000
[    2.611493] {1}[Hardware Error]:   node: 2 card: 0 module: 0 rank: 1 bank: 1 device: 0 row: 57217 column: 976 
[    2.611493] {1}[Hardware Error]:   DIMM location: CPU 1 DIMM 8 
[    2.611493] Kernel panic - not syncing: Fatal hardware error!
[    2.611493] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0+ #37
[    2.611493] Hardware name: Lenovo ThinkSystem SR650, BIOS -[IVE636Z-2.13]- 07/18/2019
[    2.611493] Call Trace:
[    2.611493]  dump_stack+0x5c/0x80
[    2.611493]  panic+0xe7/0x2a9
[    2.611493]  __ghes_panic.cold+0x21/0x21
[    2.611493]  ? ghes_proc+0xfd/0x120
[    2.611493]  ? ghes_probe+0x129/0x3b0
[    2.611493]  ? platform_drv_probe+0x38/0x90
[    2.611493]  ? really_probe+0xf9/0x3a0
[    2.611493]  ? driver_probe_device+0x4b/0xc0
[    2.611493]  ? device_driver_attach+0x55/0x60
[    2.611493]  ? __driver_attach+0x62/0x140
[    2.611493]  ? device_driver_attach+0x60/0x60
[    2.611493]  ? bus_for_each_dev+0x78/0xc0
[    2.611493]  ? bus_add_driver+0x14d/0x1f0
[    2.611493]  ? driver_register+0x6c/0xb0
[    2.611493]  ? bert_init+0x220/0x220
[    2.611493]  ? ghes_init+0x88/0xe6
[    2.611493]  ? do_one_initcall+0x46/0x1c4
[    2.611493]  ? kernel_init_freeable+0x334/0x3ca
[    2.611493]  ? rest_init+0xaa/0xaa
[    2.611493]  ? kernel_init+0xa/0xf7
[    2.611493]  ? ret_from_fork+0x1f/0x40
[    2.611493] Kernel Offset: disabled
[    2.611493] Rebooting in 10 seconds..
[    2.611493] ACPI MEMORY or I/O RESET_REG.
 UEFI:START PEI          
 UEFI:START PEI          
 UEFI:MEM INIT           
<F1> System Setup     <F10> PXE Boot            
<F2> Diagnostic       <F12> One Time Boot Device
 UEFI:DXE INIT           

> It happens because of B.
> 
It happens because of:
1. The BIOS option WHEA Memory Log is enabled; BIOS will provide a CPER error record with Panic severity for a fatal memory UE.
2. A fatal memory error happens.
3. This record is not processed in the first kernel; the MCE handler does the panic action.
4. The second kernel is started, the record is processed, and it panics.
5. vmcore and vmcore-dmesg.txt are not collected.

> Fix it by doing C.
> 
1. In the mce_panic(), dump the cper error records, ack and clear the related cper record.
   The cper record will not be leaked to second kernel. Then the kdump will work right.
   still the cper log is collected in vmcore-dmesg.txt.

> (Potentially do D).
1. Don't panic in the ghes module for a fatal memory UE on the x86 platform when EMCA is enabled.
When a fatal memory UE is in process, the NMI watchdog may be triggered, and then ghes_notify_nmi() will be called. On the x86 platform
we don't know the source of the NMI; we have no idea whether this NMI is just for the watchdog or for some other reason.
If ghes_notify_nmi() finds the CPER record with panic severity, it then panics.
This is not supposed to be so.
2. In some scenarios, the unexpected NMI processing will preempt the MCE, like the following:
The NMI totally preempts the MCE; we can collect vmcore-dmesg.txt this time,
but we have lost the mcelog in vmcore-dmesg.txt and the ERST pstore.

[  995.167270] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
[  995.167271] {1}[Hardware Error]: event severity: fatal
[  995.167271] {1}[Hardware Error]:  Error 0, type: fatal
[  995.167271] {1}[Hardware Error]:  fru_text: Card03, ChnA, DIMM0
[  995.167272] {1}[Hardware Error]:   section_type: memory error
[  995.167272] {1}[Hardware Error]:   error_status: 0x0000000000000000
[  995.167272] {1}[Hardware Error]:   physical_address: 0x00000046fe735000
[  995.167273] {1}[Hardware Error]:   node: 2 card: 0 module: 0 rank: 0 bank: 1 device: 0 row: 62761 column: 208
[  995.167273] {1}[Hardware Error]:   DIMM location: CPU 1 DIMM 4
[  995.167274] Kernel panic - not syncing: Fatal hardware error!
[  995.167274] CPU: 47 PID: 0 Comm: swapper/47 Kdump: loaded Not tainted 4.18.0+ #20
[  995.167274] Hardware name: Lenovo ThinkSystem SR650 -[7X06CTO1WW]-/-[7X06CTO1WW]-, BIOS -[IVE636Z-2.13]- 07/18/2019
[  995.167275] Call Trace:
[  995.167275]  <NMI>
[  995.167275]  dump_stack+0x5a/0x73
[  995.167275]  panic+0xe8/0x2bc
[  995.167276]  __ghes_panic+0x68/0x6a
[  995.167276]  ghes_notify_nmi+0x23b/0x290
[  995.167276]  nmi_handle+0x69/0x110
[  995.167276]  default_do_nmi+0x3e/0x110
[  995.167277]  do_nmi+0x116/0x190
[  995.167277]  end_repeat_nmi+0x16/0x6a
[  995.167277] RIP: 0010:machine_check+0x0/0x40
[  995.167278] Code: 00 00 48 89 e7 48 8b 74 24 78 48 c7 44 24 78 ff ff ff ff e8 b2 0e 66 ff e9 ed 02 00 00 0f 1f 00 66 2e 0f 1f 84 00 00 00 00 00 <0f> 01 ca 6a ff f6 44 24 10 03 75 14 e8 2f 00 00 00 48 89 e7 31 f6
[  995.167278] RSP: 0018:fffffe00007effd8 EFLAGS: 00000046
[  995.167279] RAX: ffffffff97eac2c0 RBX: ffff8f8cc0ce5c40 RCX: 7fffff18a792c0bf
[  995.167279] RDX: 0000000000000001 RSI: 000000000000002f RDI: ffff8fa39fd5d600
[  995.167280] RBP: 000000000000002f R08: 0000000000000008 R09: ffffffdb56bbc181
[  995.167280] R10: 000000000000002f R11: 0000000000000000 R12: 0000000000000000
[  995.167280] R13: 0000000000000000 R14: ffff8f8cc0ce5c40 R15: ffff8f8cc0ce5c40
[  995.167280]  ? __sched_text_end+0x4/0x4
[  995.167281]  ? async_page_fault+0x30/0x30
[  995.167281]  ? async_page_fault+0x30/0x30
[  995.167281]  </NMI>
[  995.167281]  <#MC>
[  995.167282] RIP: 0010:native_safe_halt+0xe/0x10
[  995.167282] Code: ff ff eb bc 90 90 90 90 90 90 90 90 e9 07 00 00 00 0f 00 2d 76 c8 55 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 66 c8 55 00 fb f4 <c3> 90 0f 1f 44 00 00 41 54 55 53 e8 e2 29 88 ff 65 8b 2d d3 2a 16
[  995.167283] RSP: 0018:ffffa2e4cca47ea0 EFLAGS: 00000246 </#MC>
[  995.167283]  default_idle+0x1a/0x130
[  995.167283]  do_idle+0x1a6/0x290
[  995.167283]  cpu_startup_entry+0x6f/0x80
[  995.167284]  start_secondary+0x1aa/0x200
[  995.167284]  secondary_startup_64+0xb7/0xc0

-- 
Best Regards!

Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-19  5:40     ` Aili Yao
@ 2020-11-19 17:45       ` Borislav Petkov
  2020-11-20  3:40         ` Aili Yao
  2020-11-20  9:22         ` Aili Yao
  0 siblings, 2 replies; 16+ messages in thread
From: Borislav Petkov @ 2020-11-19 17:45 UTC (permalink / raw)
  To: Aili Yao
  Cc: rjw, lenb, tony.luck, james.morse, linux-acpi, linux-edac,
	yangfeng1, CHENGUOMIN

On Thu, Nov 19, 2020 at 01:40:57PM +0800, Aili Yao wrote:
> [    0.000000] Linux version 4.18.0+  #37 SMP Thu Nov 19 10:38:16 CST 2020

Ok, before we look any further into this, please redo the whole exercise
with the latest upstream kernel - not some 4.18 old crap. Use the
tip/master branch:

https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=master

And then paste results again according to the scheme.

Thx.

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-19 17:45       ` Borislav Petkov
@ 2020-11-20  3:40         ` Aili Yao
  2020-11-20  9:22         ` Aili Yao
  1 sibling, 0 replies; 16+ messages in thread
From: Aili Yao @ 2020-11-20  3:40 UTC (permalink / raw)
  To: Borislav Petkov, lenb
  Cc: rjw, tony.luck, james.morse, linux-acpi, linux-edac, yangfeng1,
	CHENGUOMIN

On Thu, 19 Nov 2020 18:45:08 +0100
Borislav Petkov <bp@alien8.de> wrote:

> 
> Ok, before we look any further into this, please redo the whole exercise
> with the latest upstream kernel - not some 4.18 old crap. Use the
> tip/master branch:
>
Hi, here it is:

Error injected:

[    0.000000] Linux version 5.10.0-rc4 (root@qd01-test-ec2177009236.qd01.ksyun.com) (gcc (GCC) 9.3.0, GNU ld version 2.27-43.base.el7_8.1) #3 SMP Fri Nov 20 11:22:24 CST 2020
[    0.000000] Command line: BOOT_IMAGE=/vmlinuz-5.10.0-rc4 root=UUID=a503562a-dafc-40eb-b4c2-6607a3497b5e ro crashkernel=2G nomodeset net.ifnames=0 biosdevname=0 rdloaddriver=mlx5_core rdloaddriver=i40e rdloaddriver=ixgbe strict-devmem=0 hugepages=8192 console=ttyS0,115200n8
qd01-test-ec2177009236 login: [  103.725568] EINJ: Error INJection is initialized.
[  103.746313] EDAC MC: Removed device 0 for skx_edac Skylake Socket#0 IMC#0: DEV 0000:2e:0a.0
[  103.764305] EDAC MC: Removed device 1 for skx_edac Skylake Socket#0 IMC#1: DEV 0000:2e:0c.0
[  103.779293] EDAC MC: Removed device 2 for skx_edac Skylake Socket#1 IMC#0: DEV 0000:ae:0a.0
[  103.794290] EDAC MC: Removed device 3 for skx_edac Skylake Socket#1 IMC#1: DEV 0000:ae:0c.0
[  106.956286] Disabling lock debugging due to kernel taint
[  106.962373] mce: [Hardware Error]: CPU 18: Machine Check Exception: 5 Bank 7: be00000001010091
[  106.962377] mce: [Hardware Error]: RIP !INEXACT! 10:<ffffffffac58472a> 
[  106.996488] {acpi_idle_do_entry+0x4a/0x60}
[  107.001057] mce: [Hardware Error]: TSC ae4b410af0b8 ADDR 314d193000 MISC 200400c008002086 
[  107.010283] mce: [Hardware Error]: PROCESSOR 0:50657 TIME 1605843017 SOCKET 1 APIC 40 microcode 5000021
[  107.020767] mce: [Hardware Error]: Run the above through 'mcelog --ascii'
[  107.031295] mce: [Hardware Error]: Machine check: Processor context corrupt
[  107.039065] Kernel panic - not syncing: Fatal machine check

Kdump triggered:

[    0.000000] Linux version 5.10.0-rc4 (root@qd01-test-ec2177009236.qd01.ksyun.com) (gcc (GCC) 9.3.0, GNU ld version 2.27-43.base.el7_8.1) #3 SMP Fri Nov 20 11:22:24 CST 2020
[    0.000000] Command line: BOOT_IMAGE=/vmlinuz-5.10.0-rc4 ro nomodeset net.ifnames=0 biosdevname=0 rdloaddriver=mlx5_core rdloaddriver=i40e rdloaddriver=ixgbe strict-devmem=0 console=ttyS0,115200n8 irqpoll nr_cpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 rootflags=nofail acpi_no_memhotplug transparent_hugepage=never nokaslr disable_cpu_apicid=0 elfcorehdr=403684744K
[    0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x020: 'AVX-512 opmask'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x040: 'AVX-512 Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x080: 'AVX-512 ZMM_Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys User registers'
[    0.000000] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[    0.000000] x86/fpu: xstate_offset[3]:  832, xstate_sizes[3]:   64
[    0.000000] x86/fpu: xstate_offset[4]:  896, xstate_sizes[4]:   64
[    0.000000] x86/fpu: xstate_offset[5]:  960, xstate_sizes[5]:   64
[    0.000000] x86/fpu: xstate_offset[6]: 1024, xstate_sizes[6]:  512
[    0.000000] x86/fpu: xstate_offset[7]: 1536, xstate_sizes[7]: 1024
[    0.000000] x86/fpu: xstate_offset[9]: 2560, xstate_sizes[9]:    8
[    0.000000] x86/fpu: Enabled xstate features 0x2ff, context size is 2568 bytes, using 'compacted' format.

...
...
...

[    5.946962] io scheduler bfq registered
[    5.951261] atomic64_test: passed for x86-64 platform with CX8 and with SSE
[    5.960169] shpchp: Standard Hot Plug PCI Controller Driver version: 0.4
[    5.968096] input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
[    5.976376] ACPI: Power Button [PWRF]
[    5.980794] APEI: Can not request [mem 0xa7d0e040-0xa7d0e04c] for APEI ERST registers
[    5.989550] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
[    5.990513] {1}[Hardware Error]: event severity: fatal
[    5.990513] {1}[Hardware Error]:  Error 0, type: fatal
[    5.990513] {1}[Hardware Error]:  fru_text: Card03, ChnB, DIMM0
[    5.990513] {1}[Hardware Error]:   section_type: memory error
[    5.990513] {1}[Hardware Error]:   error_status: 0x0000000000000000
[    5.990513] {1}[Hardware Error]:   physical_address: 0x000000314d193000
[    5.990513] {1}[Hardware Error]:   node: 2 card: 1 module: 0 rank: 0 bank: 3 device: 0 row: 1651 column: 128 
[    5.990513] {1}[Hardware Error]:   DIMM location: CPU 1 DIMM 8 
[    5.990513] Kernel panic - not syncing: Fatal hardware error!
[    5.990513] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G          I       5.10.0-rc4 #3
[    5.990513] Hardware name: Lenovo ThinkSystem SR650 -[7X06CTO1WW]-/-[7X06CTO1WW]-, BIOS -[IVE636Z-2.13]- 07/18/2019
[    5.990513] Call Trace:
[    5.990513]  dump_stack+0x57/0x6a
[    5.990513]  panic+0xfb/0x2d7
[    5.990513]  __ghes_panic.cold+0x21/0x21
[    5.990513]  ghes_proc+0xe0/0x140
[    5.990513]  ghes_probe+0x129/0x380
[    5.990513]  platform_drv_probe+0x35/0x80
[    5.990513]  really_probe+0x31b/0x420
[    5.990513]  driver_probe_device+0xe1/0x150
[    5.990513]  device_driver_attach+0x53/0x60
[    5.990513]  __driver_attach+0x8a/0x150
[    5.990513]  ? device_driver_attach+0x60/0x60
[    5.990513]  ? device_driver_attach+0x60/0x60
[    5.990513]  bus_for_each_dev+0x78/0xc0
[    5.990513]  bus_add_driver+0x14d/0x1f0
[    5.990513]  driver_register+0x6c/0xc0
[    5.990513]  ? bert_init+0x22a/0x22a
[    5.990513]  ghes_init+0x87/0xe5
[    5.990513]  do_one_initcall+0x44/0x1d0
[    5.990513]  kernel_init_freeable+0x1d3/0x235
[    5.990513]  ? rest_init+0xb4/0xb4
[    5.990513]  kernel_init+0xa/0x10c
[    5.990513]  ret_from_fork+0x1f/0x30
[    5.990513] Kernel Offset: disabled
[    5.990513] Rebooting in 10 seconds..
[    5.990513] ACPI MEMORY or I/O RESET_REG.
 UEFI:START PEI          
 UEFI:START PEI          
 UEFI:MEM INIT           
<F1> System Setup     <F10> PXE Boot            
<F2> Diagnostic       <F12> One Time Boot Device
 UEFI:DXE INIT           
 UEFI:DXE INIT           
                

-- 
Thanks

Best Regards!

Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-19 17:45       ` Borislav Petkov
  2020-11-20  3:40         ` Aili Yao
@ 2020-11-20  9:22         ` Aili Yao
  2020-11-20 10:24           ` Borislav Petkov
  1 sibling, 1 reply; 16+ messages in thread
From: Aili Yao @ 2020-11-20  9:22 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: rjw, lenb, tony.luck, james.morse, linux-acpi, linux-edac,
	yangfeng1, CHENGUOMIN, Aili Yao

On Thu, 19 Nov 2020 18:45:08 +0100
Borislav Petkov <bp@alien8.de> wrote:

> On Thu, Nov 19, 2020 at 01:40:57PM +0800, Aili Yao wrote:
> > [    0.000000] Linux version 4.18.0+  #37 SMP Thu Nov 19 10:38:16 CST 2020  
> 
> Ok, before we look any further into this, please redo the whole exercise
> with the latest upstream kernel - not some 4.18 old crap. Use the
> tip/master branch:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/log/?h=master
> 
Hi, this test result is from tip/master; the previous one was from the latest upstream.

qd01-test-ec2177009236 login: 
CentOS Linux 7 (Core)
Kernel 5.10.0-rc3 on an x86_64

qd01-test-ec2177009236 login: [  103.306303] usb 1-1.6: USB disconnect, device number 4
[  103.312086] cdc_ether 1-1.6:1.0 usb0: unregister 'cdc_ether' usb-0000:00:14.0-1.6, CDC Ethernet Device
[  105.326306] usb 1-1.6: new high-speed USB device number 5 using xhci_hcd
[  105.410891] usb 1-1.6: New USB device found, idVendor=04b3, idProduct=4010, bcdDevice= 3.14
[  105.420214] usb 1-1.6: New USB device strings: Mfr=1, Product=2, SerialNumber=0
[  105.428377] usb 1-1.6: Product: XClarity Controller
[  105.433827] usb 1-1.6: Manufacturer: IBM
[  105.440362] cdc_ether 1-1.6:1.0 usb0: register 'cdc_ether' at usb-0000:00:14.0-1.6, CDC Ethernet Device, 3a:68:dd:14:51:1f
[  111.648657] EINJ: Error INJection is initialized.
[  111.666355] EDAC MC: Removed device 0 for skx_edac Skylake Socket#0 IMC#0: DEV 0000:2e:0a.0
[  111.686338] EDAC MC: Removed device 1 for skx_edac Skylake Socket#0 IMC#1: DEV 0000:2e:0c.0
[  111.701336] EDAC MC: Removed device 2 for skx_edac Skylake Socket#1 IMC#0: DEV 0000:ae:0a.0
[  111.726334] EDAC MC: Removed device 3 for skx_edac Skylake Socket#1 IMC#1: DEV 0000:ae:0c.0

[  [    0.000000] Linux version 5.10.0-rc3 (root@qd01-test-ec2177009236.qd01.ksyun.com) (gcc (GCC) 9.3.0, GNU ld version 2.27-43.base.el7_8.1) #3 SMP Fri Nov 20 16:59:03 CST 2020
[    0.000000] Command line: BOOT_IMAGE=/vmlinuz-5.10.0-rc3 ro nomodeset net.ifnames=0 biosdevname=0 rdloaddriver=mlx5_core rdloaddriver=i40e rdloaddriver=ixgbe strict-devmem=0 console=ttyS0,115200n8 irqpoll nr_cpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 rootflags=nofail acpi_no_memhotplug transparent_hugepage=never nokaslr disable_cpu_apicid=0 acpi_rsdp=0xf05a0 elfcorehdr=403684744K
[    0.000000] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x008: 'MPX bounds registers'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x010: 'MPX CSR'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x020: 'AVX-512 opmask'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x040: 'AVX-512 Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x080: 'AVX-512 ZMM_Hi256'
[    0.000000] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys User registers'
[    0.000000] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[    0.000000] x86/fpu: xstate_offset[3]:  832, xstate_sizes[3]:   64
[    0.000000] x86/fpu: xstate_offset[4]:  896, xstate_sizes[4]:   64
[    0.000000] x86/fpu: xstate_offset[5]:  960, xstate_sizes[5]:   64
[    0.000000] x86/fpu: xstate_offset[6]: 1024, xstate_sizes[6]:  512
[    0.000000] x86/fpu: xstate_offset[7]: 1536, xstate_sizes[7]: 1024
[    0.000000] x86/fpu: xstate_offset[9]: 2560, xstate_sizes[9]:    8
[    0.000000] x86/fpu: Enabled xstate features 0x2ff, context size is 2568 bytes, using 'compacted' format.
...
...
...
[    6.280390] input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
[    6.288655] ACPI: Power Button [PWRF]
[    6.292961] ERST: Error Record Serialization Table (ERST) support is initialized.
[    6.301295] pstore: Registered erst as persistent store backend
[    6.307912] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
[    6.308886] {1}[Hardware Error]: event severity: fatal
[    6.308886] {1}[Hardware Error]:  Error 0, type: fatal
[    6.308886] {1}[Hardware Error]:  fru_text: Card03, ChnB, DIMM0
[    6.308886] {1}[Hardware Error]:   section_type: memory error
[    6.308886] {1}[Hardware Error]:   error_status: 0x0000000000000000
[    6.308886] {1}[Hardware Error]:   physical_address: 0x00000031a53f0000
[    6.308886] {1}[Hardware Error]:   node: 2 card: 1 module: 0 rank: 0 bank: 1 device: 0 row: 3541 column: 0 
[    6.308886] {1}[Hardware Error]:   DIMM location: CPU 1 DIMM 8 
[    6.308886] Kernel panic - not syncing: Fatal hardware error!
[    6.308886] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc3 #3
[    6.308886] Hardware name: Lenovo ThinkSystem SR650 -[7X06CTO1WW]-/-[7X06CTO1WW]-, BIOS -[IVE636Z-2.13]- 07/18/2019
[    6.308886] Call Trace:
[    6.308886]  dump_stack+0x57/0x6a
[    6.308886]  panic+0xfb/0x2d7
[    6.308886]  __ghes_panic.cold+0x21/0x21
[    6.308886]  ghes_proc+0xe0/0x140
[    6.308886]  ghes_probe+0x129/0x380
[    6.308886]  platform_drv_probe+0x35/0x80
[    6.308886]  really_probe+0x31b/0x420
[    6.308886]  driver_probe_device+0xe1/0x150
[    6.308886]  device_driver_attach+0x53/0x60
[    6.308886]  __driver_attach+0x8a/0x150
[    6.308886]  ? device_driver_attach+0x60/0x60
[    6.308886]  ? device_driver_attach+0x60/0x60
[    6.308886]  bus_for_each_dev+0x78/0xc0
[    6.308886]  bus_add_driver+0x14d/0x1f0
[    6.308886]  driver_register+0x6c/0xc0
[    6.308886]  ? bert_init+0x22a/0x22a
[    6.308886]  ghes_init+0x87/0xe5
[    6.308886]  do_one_initcall+0x44/0x1d0
[    6.308886]  kernel_init_freeable+0x1d3/0x235
[    6.308886]  ? rest_init+0xb4/0xb4
[    6.308886]  kernel_init+0xa/0x10c
[    6.308886]  ret_from_fork+0x1f/0x30
[    6.308886] Kernel Offset: disabled
[    6.308886] Rebooting in 10 seconds..
[    6.308886] ACPI MEMORY or I/O RESET_REG.
 UEFI:START PEI          
 UEFI:START PEI          
 UEFI:MEM INIT           
<F1> System Setup     <F10> PXE Boot            
<F2> Diagnostic       <F12> One Time Boot Device
 UEFI:DXE INIT           
 UEFI:DXE INIT           
                
ThinkSystem SR650                                   
Serial Number 
J3016250
Machine Type 
7X06
BMC IP 
10.177.134.108
UEFI Version 
2.13 IVE636Z (07/18/2019)
BMC Version    
2.70 CDI338D (06/19/2019)

-- 
Best Regards!

Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-20  9:22         ` Aili Yao
@ 2020-11-20 10:24           ` Borislav Petkov
  2021-01-28 12:01             ` Aili Yao
  0 siblings, 1 reply; 16+ messages in thread
From: Borislav Petkov @ 2020-11-20 10:24 UTC (permalink / raw)
  To: Aili Yao, tony.luck
  Cc: rjw, lenb, james.morse, linux-acpi, linux-edac, yangfeng1, CHENGUOMIN

On Fri, Nov 20, 2020 at 05:22:35PM +0800, Aili Yao wrote:
> Hi, this test result is from tip/master; the previous one was from the latest upstream.

Thanks for doing those, now let's see.

With rc4 you have the MCE error in the first kernel:

[  106.956286] Disabling lock debugging due to kernel taint
[  106.962373] mce: [Hardware Error]: CPU 18: Machine Check Exception: 5 Bank 7: be00000001010091
[  106.962377] mce: [Hardware Error]: RIP !INEXACT! 10:<ffffffffac58472a>
[  106.996488] {acpi_idle_do_entry+0x4a/0x60}
[  107.001057] mce: [Hardware Error]: TSC ae4b410af0b8 ADDR 314d193000 MISC 200400c008002086
[  107.010283] mce: [Hardware Error]: PROCESSOR 0:50657 TIME 1605843017 SOCKET 1 APIC 40 microcode 5000021
[  107.020767] mce: [Hardware Error]: Run the above through 'mcelog --ascii'
[  107.031295] mce: [Hardware Error]: Machine check: Processor context corrupt
[  107.039065] Kernel panic - not syncing: Fatal machine check

Now the kdump kernel fires and there's an error record in the CPER
thing.

> [    6.280390] input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
> [    6.288655] ACPI: Power Button [PWRF]
> [    6.292961] ERST: Error Record Serialization Table (ERST) support is initialized.
> [    6.301295] pstore: Registered erst as persistent store backend
> [    6.307912] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
> [    6.308886] {1}[Hardware Error]: event severity: fatal
> [    6.308886] {1}[Hardware Error]:  Error 0, type: fatal
> [    6.308886] {1}[Hardware Error]:  fru_text: Card03, ChnB, DIMM0
> [    6.308886] {1}[Hardware Error]:   section_type: memory error
> [    6.308886] {1}[Hardware Error]:   error_status: 0x0000000000000000

And this error_status is all clear. I wonder why.

Looking at the UEFI spec "Section O: Error Status", it defines a couple
of bits there: whether it was an address or control bits error, who
detected the error (responder, requestor), whether it was the first
error, etc, etc.

And none of those bits are set.

Which makes me not trust that error record a whole lot but that's a
given, since it is firmware and firmware is an unfixable piece of crap
by definition.

So then one could probably say that if none of those error status bits
are set, then the error being reported is not something, let's say
"fresh". This is doubly the case considering that it gets detected when
the GHES driver probes:

        /* Handle any pending errors right away */
        spin_lock_irqsave(&ghes_notify_lock_irq, flags);
        ghes_proc(ghes);
        spin_unlock_irqrestore(&ghes_notify_lock_irq, flags);

so *maybe*, just *maybe* one could say here:

If the error_status doesn't have any valid bits *and* it has been
detected on driver init - i.e., the error has been there before the
driver probed, then even if the error is fatal, GHES should not call
__ghes_panic().

The even better way to detect this is to be able to check whether this
is the kdump kernel and whether it got loaded due to a fatal MCE in the
first kernel and then match that error address with the error address of
the error which caused the first panic in the mce code. Then the second
kernel won't need to panic but simply log.

However, I think that second way to check is probably hard and the first
heuristic is probably good enough...

Tony, thoughts?

-- 
Regards/Gruss,
    Boris.

https://people.kernel.org/tglx/notes-about-netiquette

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2020-11-20 10:24           ` Borislav Petkov
@ 2021-01-28 12:01             ` Aili Yao
  2021-01-28 17:22               ` Luck, Tony
  0 siblings, 1 reply; 16+ messages in thread
From: Aili Yao @ 2021-01-28 12:01 UTC (permalink / raw)
  To: Borislav Petkov
  Cc: tony.luck, rjw, lenb, james.morse, linux-acpi, linux-edac,
	yangfeng1, CHENGUOMIN


> On Fri, Nov 20, 2020 at 05:22:35PM +0800, Aili Yao wrote:
> > Hi, This test result if from tip/master, previous is upstream latest.  
> 
> Thanks for doing those, now let's see.
> 
> With rc4 you have the MCE error in the first kernel:
> 
> [  106.956286] Disabling lock debugging due to kernel taint
> [  106.962373] mce: [Hardware Error]: CPU 18: Machine Check Exception: 5 Bank 7: be00000001010091
> [  106.962377] mce: [Hardware Error]: RIP !INEXACT! 10:<ffffffffac58472a>
> [  106.996488] {acpi_idle_do_entry+0x4a/0x60}
> [  107.001057] mce: [Hardware Error]: TSC ae4b410af0b8 ADDR 314d193000 MISC 200400c008002086
> [  107.010283] mce: [Hardware Error]: PROCESSOR 0:50657 TIME 1605843017 SOCKET 1 APIC 40 microcode 5000021
> [  107.020767] mce: [Hardware Error]: Run the above through 'mcelog --ascii'
> [  107.031295] mce: [Hardware Error]: Machine check: Processor context corrupt
> [  107.039065] Kernel panic - not syncing: Fatal machine check
> 
> Now the kdump kernel fires and there's an error record in the CPER
> thing.
> 
> > [    6.280390] input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0
> > [    6.288655] ACPI: Power Button [PWRF]
> > [    6.292961] ERST: Error Record Serialization Table (ERST) support is initialized.
> > [    6.301295] pstore: Registered erst as persistent store backend
> > [    6.307912] {1}[Hardware Error]: Hardware error from APEI Generic Hardware Error Source: 4
> > [    6.308886] {1}[Hardware Error]: event severity: fatal
> > [    6.308886] {1}[Hardware Error]:  Error 0, type: fatal
> > [    6.308886] {1}[Hardware Error]:  fru_text: Card03, ChnB, DIMM0
> > [    6.308886] {1}[Hardware Error]:   section_type: memory error
> > [    6.308886] {1}[Hardware Error]:   error_status: 0x0000000000000000  
> 
> And this error_status is all clear. I wonder why.
> 
> Looking at the UEFI spec "Section O: Error Status", it defines a couple
> of bits there: whether it was an address or control bits error, who
> detected the error (responder, requestor), whether it was the first
> error, etc, etc.
> 
> And none of those bits are set.
> 
> Which makes me not trust that error record a whole lot but that's a
> given, since it is firmware and firmware is an unfixable piece of crap
> by definition.
> 
> So then one could probably say that if none of those error status bits
> are set, then the error being reported is not something, let's say
> "fresh". This is doubly the case considering that it gets detected when
> the GHES driver probes:
> 
>         /* Handle any pending errors right away */
>         spin_lock_irqsave(&ghes_notify_lock_irq, flags);
>         ghes_proc(ghes);
>         spin_unlock_irqrestore(&ghes_notify_lock_irq, flags);
> 
> so *maybe*, just *maybe* one could say here:
> 
> If the error_status doesn't have any valid bits *and* it has been
> detected on driver init - i.e., the error has been there before the
> driver probed, then even if the error is fatal, GHES should not call
> __ghes_panic().
> 
> The even better way to detect this is to be able to check whether this
> is the kdump kernel and whether it got loaded due to a fatal MCE in the
> first kernel and then match that error address with the error address of
> the error which caused the first panic in the mce code. Then the second
> kernel won't need to panic but simply log.
> 
> However, I think that second way to check is probably hard and the first
> heuristic is probably good enough...
> 
> Tony, thoughts?
> 

It has been a while since this issue was discussed — any feedback?
In kexec-tools, the hest_disable parameter has been added to the 2nd kernel, so
kdump will no longer be affected by GHES errors.
But we may still lose the GHES error info, so I think this patch is still needed?

Thanks

-- 
Best Regards!

Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v2] Dump cper error table in mce_panic
  2021-01-28 12:01             ` Aili Yao
@ 2021-01-28 17:22               ` Luck, Tony
  2021-02-23  9:18                 ` Aili Yao
  0 siblings, 1 reply; 16+ messages in thread
From: Luck, Tony @ 2021-01-28 17:22 UTC (permalink / raw)
  To: Aili Yao, Borislav Petkov
  Cc: rjw, lenb, james.morse, linux-acpi, linux-edac, yangfeng1, CHENGUOMIN

> The even better way to detect this is to be able to check whether this
> is the kdump kernel and whether it got loaded due to a fatal MCE in the
> first kernel and then match that error address with the error address of
> the error which caused the first panic in the mce code. Then the second
> kernel won't need to panic but simply log.

The biggest problem with all of the logging (whether in machine check
banks, or in error records from BIOS) is the lack of a timestamp. If there
was a way to tell if this "just happened", or "happened a while ago" then
such "take action" or "just log" decisions would be simpler.

Maybe you don't need to do *all* those matching checks.  Just a flag
from the first kernel to say "I died from a fatal machine check" could
be used to tell the kdump kernel "just log the cper" stuff.

If the system is broken enough that more machine checks are still
firing in the kdump kernel ... then you would miss trying to recover.
But if more machine checks are happening, then the kdump kernel
is likely doomed anyway.

Getting a full memory dump after a machine check generally isn't
all that useful anyway. The problem was (almost certainly) h/w, so
not much benefit in decoding the dump to find which code was running
when the h/w signalled.

A second bite at getting the error logs from the death of the first
kernel is worth it though.

-Tony

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2021-01-28 17:22               ` Luck, Tony
@ 2021-02-23  9:18                 ` Aili Yao
  2021-02-23 19:32                   ` Luck, Tony
  0 siblings, 1 reply; 16+ messages in thread
From: Aili Yao @ 2021-02-23  9:18 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, rjw, lenb, james.morse, linux-acpi, linux-edac,
	yangfeng1, yaoaili

On Thu, 28 Jan 2021 17:22:30 +0000
"Luck, Tony" <tony.luck@intel.com> wrote:

> 
> Getting a full memory dump after a machine check generally isn't
> all that useful anyway. The problem was (almost certainly) h/w, so
> not much benefit in decoding the dump to find which code was running
> when the h/w signalled.

The purpose of collecting the coredump log is not to identify what the backtrace
is; I want to confirm whether this panic is really needed. There are too many panics in production
environments with MCA Recovery enabled, and no kernel log is collected, so in some ways we can't see
the benefits from MCA recovery, and for Purley this feature costs too much.

And the unexpected NMI for a fatal memory error is not the right way to get the work done.
This is not right, and it shouldn't happen.

> A second bite at getting the error logs from the death of the first
> kernel is worth it though.

I am not smart enough to get the point. I have spent a lot of time on this patch,
and I need a result even if it doesn't work, so I would like a reply like one of these:

1. this patch is meaningless, and should be rejected.
2. this issue is real, but we need another method, not this patch.
3. the patch needs to be improved.


Thanks
Aili Yao

^ permalink raw reply	[flat|nested] 16+ messages in thread

* RE: [PATCH v2] Dump cper error table in mce_panic
  2021-02-23  9:18                 ` Aili Yao
@ 2021-02-23 19:32                   ` Luck, Tony
  2021-02-24  9:56                     ` Aili Yao
  0 siblings, 1 reply; 16+ messages in thread
From: Luck, Tony @ 2021-02-23 19:32 UTC (permalink / raw)
  To: Aili Yao
  Cc: Borislav Petkov, rjw, lenb, james.morse, linux-acpi, linux-edac,
	yangfeng1

> I am not smart enough to get the point. I have spent a lot of time on this patch,
> and I need a result even if it doesn't work, so I would like a reply like one of these:
>
> 1. this patch is meaningless, and should be rejected.
> 2. this issue is real, but we need another method, not this patch.
> 3. the patch needs to be improved.

I don't want to say that the patch is meaningless ... it may be useful to you
in your environment to help sort out machine checks due to h/w issues vs.
programming errors in the machine check recovery code.

But I don't think it is generally useful in the upstream code.

-Tony

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH v2] Dump cper error table in mce_panic
  2021-02-23 19:32                   ` Luck, Tony
@ 2021-02-24  9:56                     ` Aili Yao
  0 siblings, 0 replies; 16+ messages in thread
From: Aili Yao @ 2021-02-24  9:56 UTC (permalink / raw)
  To: Luck, Tony
  Cc: Borislav Petkov, rjw, lenb, james.morse, linux-acpi, linux-edac,
	yangfeng1

On Tue, 23 Feb 2021 19:32:37 +0000
"Luck, Tony" <tony.luck@intel.com> wrote:

> > I am not smart enough to get the point. I have spent a lot of time on this patch, 
> > and I need a result even if it doesn't work, so I would like a reply like one of these:
> >
> > 1. this patch is meaningless, and should be rejected.
> > 2. this issue is real, but we need another method, not this patch.
> > 3. the patch needs to be improved.  
> 
> I don't want to say that the patch is meaningless ... it may be useful to you
> in your environment to help sort out machine checks due to h/w issues vs.
> programming errors in the machine check recovery code.
> 
> But I don't think it is generally useful in the upstream code.

Got it.
Another thing I want to say is that when mca_cfg.tolerant is set to 3, this NMI handling will
also panic the system in some cases, but it does not seem to have a big influence.

Thanks
Aili Yao  


^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2021-02-24  9:58 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-11-04  6:50 [PATCH] Dump cper error table in mce_panic yaoaili126
2020-11-04 10:16 ` kernel test robot
2020-11-06 19:35 ` James Morse
2020-11-18  3:12   ` Aili Yao
2020-11-17  9:58 ` [PATCH v2] " Aili Yao
2020-11-18 12:45   ` Borislav Petkov
2020-11-19  5:40     ` Aili Yao
2020-11-19 17:45       ` Borislav Petkov
2020-11-20  3:40         ` Aili Yao
2020-11-20  9:22         ` Aili Yao
2020-11-20 10:24           ` Borislav Petkov
2021-01-28 12:01             ` Aili Yao
2021-01-28 17:22               ` Luck, Tony
2021-02-23  9:18                 ` Aili Yao
2021-02-23 19:32                   ` Luck, Tony
2021-02-24  9:56                     ` Aili Yao

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).