All of lore.kernel.org
 help / color / mirror / Atom feed
From: Robert Richter <rrichter@marvell.com>
To: Borislav Petkov <bp@alien8.de>,
	Mauro Carvalho Chehab <mchehab@kernel.org>,
	Tony Luck <tony.luck@intel.com>
Cc: James Morse <james.morse@arm.com>,
	Aristeu Rozanski <aris@redhat.com>,
	Robert Richter <rrichter@marvell.com>,
	Matthias Brugger <mbrugger@suse.com>,
	<linux-edac@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCH v2 05/10] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
Date: Wed, 22 Apr 2020 13:58:09 +0200	[thread overview]
Message-ID: <20200422115814.22205-6-rrichter@marvell.com> (raw)
In-Reply-To: <20200422115814.22205-1-rrichter@marvell.com>

The ghes driver reports errors with 'unknown label' even if the actual
DIMM label is known, e.g.:

 EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0
   module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

 EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0
   rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

The labels are initialized by reading the bank and device strings from
DMI. Now, the label information can also read from sysfs. E.g. a
ThunderX2 system will show the following:

 /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
 /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
 /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
 /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
 /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
 /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
 /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
 /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
 /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
 /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
 /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
 /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
 /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
 /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
 /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since dimm_labels can be rewritten, that label will be used in a later
error report:

 # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
 # # some error injection here
 # dmesg | grep foobar
 [ 2119.784489] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0
 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 page:0x94d027 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0
 card:0 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 status(0x0000000000000400): Storage error in DRAM memory)

Signed-off-by: Robert Richter <rrichter@marvell.com>
---
 drivers/edac/ghes_edac.c | 43 +++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 23adb7674f9b..a5890afa9c71 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -153,7 +153,7 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 		(*num_dimm)++;
 }
 
-static int get_dimm_smbios_index(u16 handle)
+static struct dimm_info *find_dimm_by_handle(u16 handle)
 {
 	struct ghes_dimm *ghes_dimm;
 
@@ -161,10 +161,25 @@ static int get_dimm_smbios_index(u16 handle)
 
 	list_for_each_entry(ghes_dimm, &ghes_dimm_list, entry) {
 		if (ghes_dimm->handle == handle)
-			return ghes_dimm->dimm->idx;
+			return ghes_dimm->dimm;
 	}
 
-	return -1;
+	return NULL;
+}
+
+static void ghes_dimm_setup_label(struct dimm_info *dimm, u16 handle)
+{
+	const char *bank = NULL, *device = NULL;
+
+	dmi_memdev_name(handle, &bank, &device);
+
+	/* both strings must be non-zero */
+	if (bank && *bank && device && *device)
+		snprintf(dimm->label, sizeof(dimm->label),
+			"%s %s", bank, device);
+	else
+		snprintf(dimm->label, sizeof(dimm->label),
+			"unknown memory (handle: 0x%.4x)", handle);
 }
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
@@ -248,9 +263,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 		dimm->dtype = DEV_UNKNOWN;
 		dimm->grain = 128;		/* Likely, worse case */
 
-		/*
-		 * FIXME: It shouldn't be hard to also fill the DIMM labels
-		 */
+		ghes_dimm_setup_label(dimm, entry->handle);
 
 		if (dimm->nr_pages) {
 			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
@@ -416,19 +429,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
-		const char *bank = NULL, *device = NULL;
-		int index = -1;
+		struct dimm_info *dimm;
 
-		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
-		if (bank != NULL && device != NULL)
-			p += sprintf(p, "DIMM location:%s %s ", bank, device);
-		else
+		dimm = find_dimm_by_handle(mem_err->mem_dev_handle);
+		if (dimm) {
+			e->top_layer = dimm->idx;
+			strcpy(e->label, dimm->label);
+			p += sprintf(p, "DIMM location:%s ", dimm->label);
+		} else {
 			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 				     mem_err->mem_dev_handle);
-
-		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
-		if (index >= 0)
-			e->top_layer = index;
+		}
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';
-- 
2.20.1


  parent reply	other threads:[~2020-04-22 11:59 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-04-22 11:58 [PATCH v2 00/10] EDAC/mc/ghes: Fixes, cleanup and reworks Robert Richter
2020-04-22 11:58 ` [PATCH v2 01/10] EDAC/mc: Fix usage of snprintf() and dimm location setup Robert Richter
2020-04-22 20:52   ` Borislav Petkov
2020-05-19  9:27     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 02/10] EDAC/mc: Use int type for parameters of edac_mc_alloc() Robert Richter
2020-04-23 17:49   ` Borislav Petkov
2020-05-19  9:33     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 03/10] EDAC/ghes: Remove unused members of struct ghes_edac_pvt, rename it to ghes_mci Robert Richter
2020-04-23 17:55   ` Borislav Petkov
2020-05-05  7:50     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 04/10] EDAC/ghes: Make SMBIOS handle private data to ghes Robert Richter
2020-04-24 12:12   ` kbuild test robot
2020-04-24 12:12     ` kbuild test robot
2020-04-24 16:21   ` Borislav Petkov
2020-05-05 12:48     ` Robert Richter
2020-04-22 11:58 ` Robert Richter [this message]
2020-04-22 11:58 ` [PATCH v2 06/10] EDAC/ghes: Remove local variable rdr_mask in ghes_edac_dmidecode() Robert Richter
2020-04-27  7:08   ` Borislav Petkov
2020-04-27 17:24     ` Luck, Tony
2020-04-27 17:34       ` Borislav Petkov
2020-05-19  9:34         ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 07/10] EDAC/ghes: Cleanup struct ghes_edac_dimm_fill, rename it to ghes_dimm_fill Robert Richter
2020-04-27 14:00   ` Borislav Petkov
2020-05-19  9:35     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 08/10] EDAC/ghes: Carve out MC device handling into separate functions Robert Richter
2020-04-27 16:38   ` Borislav Petkov
2020-05-06  8:45     ` Robert Richter
2020-05-11 13:32       ` Borislav Petkov
2020-05-19  9:57         ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 09/10] EDAC/ghes: Have a separate code path for creating the fake MC Robert Richter
2020-04-22 11:58 ` [PATCH v2 10/10] EDAC/ghes: Carve out code into ghes_edac_register_{one,fake}() Robert Richter
2020-05-06  8:53 ` [PATCH v2 00/10] EDAC/mc/ghes: Fixes, cleanup and reworks Robert Richter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200422115814.22205-6-rrichter@marvell.com \
    --to=rrichter@marvell.com \
    --cc=aris@redhat.com \
    --cc=bp@alien8.de \
    --cc=james.morse@arm.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mbrugger@suse.com \
    --cc=mchehab@kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.