[v3] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
diff mbox series

Message ID 20200518095852.28010-1-rrichter@marvell.com
State New
Headers show
Series
  • [v3] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
Related show

Commit Message

Robert Richter May 18, 2020, 9:58 a.m. UTC
The ghes driver reports errors with 'unknown label' even if the actual
DIMM label is known, e.g.:

 EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0
   module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

 EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0
   rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

The labels are initialized by reading the bank and device strings from
DMI. Now, the label information can also read from sysfs. E.g. a
ThunderX2 system will show the following:

 /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
 /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
 /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
 /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
 /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
 /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
 /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
 /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
 /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
 /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
 /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
 /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
 /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
 /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
 /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since dimm_labels can be rewritten, that label will be used in a later
error report:

 # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
 # # some error injection here
 # dmesg | grep foobar
 [ 2119.784489] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0
 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 page:0x94d027 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0
 card:0 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 status(0x0000000000000400): Storage error in DRAM memory)

Signed-off-by: Robert Richter <rrichter@marvell.com>
---
This patch is a self-contained version of:

 [v2,05/10] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
 https://lore.kernel.org/patchwork/patch/1229388/

 [02/11] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
 https://lore.kernel.org/patchwork/patch/1205891/

It applies on ras:edac-for-next commit id af8a9a36af01 ("EDAC/ghes:
Setup DIMM label from DMI and use it in error reports").

v3 changes:

 * shortend function name to dimm_setup_label(),

 * let args stick out of find_dimm_by_handle() (line length 82 chars).
---
 drivers/edac/ghes_edac.c | 43 +++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 16 deletions(-)

Comments

Borislav Petkov May 19, 2020, 8:25 p.m. UTC | #1
On Mon, May 18, 2020 at 11:58:52AM +0200, Robert Richter wrote:
> +static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
> +{
> +	const char *bank = NULL, *device = NULL;
> +
> +	dmi_memdev_name(handle, &bank, &device);
> +
> +	/* both strings must be non-zero */
> +	if (bank && *bank && device && *device)
> +		snprintf(dimm->label, sizeof(dimm->label),
> +			"%s %s", bank, device);
> +	else
> +		snprintf(dimm->label, sizeof(dimm->label),
> +			"unknown memory (handle: 0x%.4x)", handle);

This changes the sysfs strings on my test box like this. 00-ghes.before
and 01-ghes.after are created by doing:

grep -EriIn . /sys/devices/system/edac/ 2>/dev/null > [filename]

edac_mc_alloc_dimms() already sets the dimm->label to "mc#%dmemory#%d"
but I'm guessing that dmi_memdev_name() doesn't give on my machine what
it gives on yours.

Welcome to the wonderful world of consistently implemented firmware!

--- 00-ghes.before      2020-05-19 17:55:50.821220239 +0200
+++ 01-ghes.after       2020-05-19 22:09:28.808492701 +0200
@@ -17,7 +17,7 @@
 /sys/devices/system/edac/mc/mc0/ce_count:1:0
 /sys/devices/system/edac/mc/mc0/mc_name:1:ghes_edac
 /sys/devices/system/edac/mc/mc0/csrow15/ce_count:1:0
-/sys/devices/system/edac/mc/mc0/csrow15/ch0_dimm_label:1:mc#0memory#15
+/sys/devices/system/edac/mc/mc0/csrow15/ch0_dimm_label:1:unknown memory (handle: 0x0030)
 /sys/devices/system/edac/mc/mc0/csrow15/power/runtime_active_time:1:0
 /sys/devices/system/edac/mc/mc0/csrow15/power/runtime_active_kids:1:0
 /sys/devices/system/edac/mc/mc0/csrow15/power/runtime_usage:1:0
@@ -42,7 +42,7 @@
 /sys/devices/system/edac/mc/mc0/power/runtime_enabled:1:disabled & forbidden
 /sys/devices/system/edac/mc/mc0/power/control:1:on
 /sys/devices/system/edac/mc/mc0/csrow31/ce_count:1:0
-/sys/devices/system/edac/mc/mc0/csrow31/ch0_dimm_label:1:mc#0memory#31
+/sys/devices/system/edac/mc/mc0/csrow31/ch0_dimm_label:1:unknown memory (handle: 0x0040)
 /sys/devices/system/edac/mc/mc0/csrow31/power/runtime_active_time:1:0
 /sys/devices/system/edac/mc/mc0/csrow31/power/runtime_active_kids:1:0
 /sys/devices/system/edac/mc/mc0/csrow31/power/runtime_usage:1:0
@@ -73,10 +73,10 @@
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_dev_type:1:Unknown
 /sys/devices/system/edac/mc/mc0/dimm15/size:1:32768
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_ce_count:1:0
-/sys/devices/system/edac/mc/mc0/dimm15/dimm_label:1:mc#0memory#15
+/sys/devices/system/edac/mc/mc0/dimm15/dimm_label:1:unknown memory (handle: 0x0030)
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_location:1:memory 15 
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_edac_mode:1:SECDED
-/sys/devices/system/edac/mc/mc0/seconds_since_reset:1:354
+/sys/devices/system/edac/mc/mc0/seconds_since_reset:1:979
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_ue_count:1:0
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_mem_type:1:Registered-DDR4
 /sys/devices/system/edac/mc/mc0/dimm31/power/runtime_active_time:1:0
@@ -90,7 +90,7 @@
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_dev_type:1:Unknown
 /sys/devices/system/edac/mc/mc0/dimm31/size:1:32768
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_ce_count:1:0
-/sys/devices/system/edac/mc/mc0/dimm31/dimm_label:1:mc#0memory#31
+/sys/devices/system/edac/mc/mc0/dimm31/dimm_label:1:unknown memory (handle: 0x0040)
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_location:1:memory 31 
 /sys/devices/system/edac/mc/mc0/dimm31/dimm_edac_mode:1:SECDED
 /sys/devices/system/edac/mc/mc0/max_location:1:memory 31
Borislav Petkov May 19, 2020, 9:30 p.m. UTC | #2
On Tue, May 19, 2020 at 10:25:35PM +0200, Borislav Petkov wrote:
> but I'm guessing that dmi_memdev_name() doesn't give on my machine what
> it gives on yours.

IOW, this ontop:

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index c7d404629863..f84c117c4310 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -109,9 +109,9 @@ static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
        if (bank && *bank && device && *device)
                snprintf(dimm->label, sizeof(dimm->label),
                        "%s %s", bank, device);
-       else
-               snprintf(dimm->label, sizeof(dimm->label),
-                       "unknown memory (handle: 0x%.4x)", handle);
+       /*
+        * else fallback to the EDAC default name.
+        */
 }
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
Robert Richter May 20, 2020, 2:11 p.m. UTC | #3
Thanks for testing.

On 19.05.20 22:25:35, Borislav Petkov wrote:

> -/sys/devices/system/edac/mc/mc0/csrow15/ch0_dimm_label:1:mc#0memory#15
> +/sys/devices/system/edac/mc/mc0/csrow15/ch0_dimm_label:1:unknown memory (handle: 0x0030)

Looks like on that system device locator or bank locator (or both) are
empty. What shows this (esp. the Locator fields):?

 dmidecode -t 17

So maybe the check is too strict and we should allow one of both being
empty?

If the strings are missing, shouldn't we still provide the handle in
the label information?

The string 'mc#0memory#15' is also of no use as it just takes the
mc_num and dimm_num as a reference, which can be determined from sysfs
without that label.

Your add on patch to just ignore the write does not revert to we old
behavior as you will see now 'mc#0memory#15' in the error reports
where you have seen 'unknown label' before.

So the question is what to do if that information is missing?

Note: it should better show "unknown label ..." instead of "unknown
memory ...".

Thanks,

-Robert

Patch
diff mbox series

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index cb3dab56a875..c7d404629863 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -87,16 +87,31 @@  static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 		(*num_dimm)++;
 }
 
-static int get_dimm_smbios_index(struct mem_ctl_info *mci, u16 handle)
+static struct dimm_info *find_dimm_by_handle(struct mem_ctl_info *mci, u16 handle)
 {
 	struct dimm_info *dimm;
 
 	mci_for_each_dimm(mci, dimm) {
 		if (dimm->smbios_handle == handle)
-			return dimm->idx;
+			return dimm;
 	}
 
-	return -1;
+	return NULL;
+}
+
+static void dimm_setup_label(struct dimm_info *dimm, u16 handle)
+{
+	const char *bank = NULL, *device = NULL;
+
+	dmi_memdev_name(handle, &bank, &device);
+
+	/* both strings must be non-zero */
+	if (bank && *bank && device && *device)
+		snprintf(dimm->label, sizeof(dimm->label),
+			"%s %s", bank, device);
+	else
+		snprintf(dimm->label, sizeof(dimm->label),
+			"unknown memory (handle: 0x%.4x)", handle);
 }
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
@@ -179,9 +194,7 @@  static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 		dimm->dtype = DEV_UNKNOWN;
 		dimm->grain = 128;		/* Likely, worse case */
 
-		/*
-		 * FIXME: It shouldn't be hard to also fill the DIMM labels
-		 */
+		dimm_setup_label(dimm, entry->handle);
 
 		if (dimm->nr_pages) {
 			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
@@ -344,19 +357,17 @@  void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
-		const char *bank = NULL, *device = NULL;
-		int index = -1;
+		struct dimm_info *dimm;
 
-		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
-		if (bank != NULL && device != NULL)
-			p += sprintf(p, "DIMM location:%s %s ", bank, device);
-		else
+		dimm = find_dimm_by_handle(mci, mem_err->mem_dev_handle);
+		if (dimm) {
+			e->top_layer = dimm->idx;
+			strcpy(e->label, dimm->label);
+			p += sprintf(p, "DIMM location:%s ", dimm->label);
+		} else {
 			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 				     mem_err->mem_dev_handle);
-
-		index = get_dimm_smbios_index(mci, mem_err->mem_dev_handle);
-		if (index >= 0)
-			e->top_layer = index;
+		}
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';