Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
From: Robert Richter <rrichter@marvell.com>
To: Borislav Petkov <bp@alien8.de>,
	Mauro Carvalho Chehab <mchehab@kernel.org>,
	Tony Luck <tony.luck@intel.com>
Cc: James Morse <james.morse@arm.com>,
	Aristeu Rozanski <aris@redhat.com>,
	Robert Richter <rrichter@marvell.com>,
	Matthias Brugger <mbrugger@suse.com>,
	<linux-edac@vger.kernel.org>, <linux-kernel@vger.kernel.org>
Subject: [PATCH v2 05/10] EDAC/ghes: Setup DIMM label from DMI and use it in error reports
Date: Wed, 22 Apr 2020 13:58:09 +0200
Message-ID: <20200422115814.22205-6-rrichter@marvell.com> (raw)
In-Reply-To: <20200422115814.22205-1-rrichter@marvell.com>

The ghes driver reports errors with 'unknown label' even if the actual
DIMM label is known, e.g.:

 EDAC MC0: 1 CE Single-bit ECC on unknown label (node:0 card:0
   module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x966a9b3 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:0 col:13 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

Fix this by using struct dimm_info's label string in error reports:

 EDAC MC0: 1 CE Single-bit ECC on N0 DIMM_A0 (node:0 card:0 module:0
   rank:1 bank:515 col:14 bit_pos:16 DIMM location:N0 DIMM_A0
   page:0x99223d8 offset:0x0 grain:1 syndrome:0x0 - APEI location:
   node:0 card:0 module:0 rank:1 bank:515 col:14 bit_pos:16 DIMM
   location:N0 DIMM_A0 status(0x0000000000000400): Storage error in
   DRAM memory)

The labels are initialized by reading the bank and device strings from
DMI. Now, the label information can also be read from sysfs. E.g. a
ThunderX2 system will show the following:

 /sys/devices/system/edac/mc/mc0/dimm0/dimm_label:N0 DIMM_A0
 /sys/devices/system/edac/mc/mc0/dimm1/dimm_label:N0 DIMM_B0
 /sys/devices/system/edac/mc/mc0/dimm2/dimm_label:N0 DIMM_C0
 /sys/devices/system/edac/mc/mc0/dimm3/dimm_label:N0 DIMM_D0
 /sys/devices/system/edac/mc/mc0/dimm4/dimm_label:N0 DIMM_E0
 /sys/devices/system/edac/mc/mc0/dimm5/dimm_label:N0 DIMM_F0
 /sys/devices/system/edac/mc/mc0/dimm6/dimm_label:N0 DIMM_G0
 /sys/devices/system/edac/mc/mc0/dimm7/dimm_label:N0 DIMM_H0
 /sys/devices/system/edac/mc/mc0/dimm8/dimm_label:N1 DIMM_I0
 /sys/devices/system/edac/mc/mc0/dimm9/dimm_label:N1 DIMM_J0
 /sys/devices/system/edac/mc/mc0/dimm10/dimm_label:N1 DIMM_K0
 /sys/devices/system/edac/mc/mc0/dimm11/dimm_label:N1 DIMM_L0
 /sys/devices/system/edac/mc/mc0/dimm12/dimm_label:N1 DIMM_M0
 /sys/devices/system/edac/mc/mc0/dimm13/dimm_label:N1 DIMM_N0
 /sys/devices/system/edac/mc/mc0/dimm14/dimm_label:N1 DIMM_O0
 /sys/devices/system/edac/mc/mc0/dimm15/dimm_label:N1 DIMM_P0

Since a dimm_label can be rewritten via sysfs, the new label will be
used in later error reports:

 # echo foobar >/sys/devices/system/edac/mc/mc0/dimm0/dimm_label
 # # some error injection here
 # dmesg | grep foobar
 [ 2119.784489] EDAC MC0: 1 CE Single-bit ECC on foobar (node:0 card:0
 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 page:0x94d027 offset:0x0 grain:1 syndrome:0x0 - APEI location: node:0
 card:0 module:0 rank:0 bank:769 col:1 bit_pos:16 DIMM location:foobar
 status(0x0000000000000400): Storage error in DRAM memory)

Signed-off-by: Robert Richter <rrichter@marvell.com>
---
 drivers/edac/ghes_edac.c | 43 +++++++++++++++++++++++++---------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 23adb7674f9b..a5890afa9c71 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -153,7 +153,7 @@ static void ghes_edac_count_dimms(const struct dmi_header *dh, void *arg)
 		(*num_dimm)++;
 }
 
-static int get_dimm_smbios_index(u16 handle)
+static struct dimm_info *find_dimm_by_handle(u16 handle)
 {
 	struct ghes_dimm *ghes_dimm;
 
@@ -161,10 +161,25 @@ static int get_dimm_smbios_index(u16 handle)
 
 	list_for_each_entry(ghes_dimm, &ghes_dimm_list, entry) {
 		if (ghes_dimm->handle == handle)
-			return ghes_dimm->dimm->idx;
+			return ghes_dimm->dimm;
 	}
 
-	return -1;
+	return NULL;
+}
+
+static void ghes_dimm_setup_label(struct dimm_info *dimm, u16 handle)
+{
+	const char *bank = NULL, *device = NULL;
+
+	dmi_memdev_name(handle, &bank, &device);
+
+	/* both strings must be non-zero */
+	if (bank && *bank && device && *device)
+		snprintf(dimm->label, sizeof(dimm->label),
+			"%s %s", bank, device);
+	else
+		snprintf(dimm->label, sizeof(dimm->label),
+			"unknown memory (handle: 0x%.4x)", handle);
 }
 
 static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
@@ -248,9 +263,7 @@ static void ghes_edac_dmidecode(const struct dmi_header *dh, void *arg)
 		dimm->dtype = DEV_UNKNOWN;
 		dimm->grain = 128;		/* Likely, worse case */
 
-		/*
-		 * FIXME: It shouldn't be hard to also fill the DIMM labels
-		 */
+		ghes_dimm_setup_label(dimm, entry->handle);
 
 		if (dimm->nr_pages) {
 			edac_dbg(1, "DIMM%i: %s size = %d MB%s\n",
@@ -416,19 +429,17 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	if (mem_err->validation_bits & CPER_MEM_VALID_BIT_POSITION)
 		p += sprintf(p, "bit_pos:%d ", mem_err->bit_pos);
 	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE) {
-		const char *bank = NULL, *device = NULL;
-		int index = -1;
+		struct dimm_info *dimm;
 
-		dmi_memdev_name(mem_err->mem_dev_handle, &bank, &device);
-		if (bank != NULL && device != NULL)
-			p += sprintf(p, "DIMM location:%s %s ", bank, device);
-		else
+		dimm = find_dimm_by_handle(mem_err->mem_dev_handle);
+		if (dimm) {
+			e->top_layer = dimm->idx;
+			strcpy(e->label, dimm->label);
+			p += sprintf(p, "DIMM location:%s ", dimm->label);
+		} else {
 			p += sprintf(p, "DIMM DMI handle: 0x%.4x ",
 				     mem_err->mem_dev_handle);
-
-		index = get_dimm_smbios_index(mem_err->mem_dev_handle);
-		if (index >= 0)
-			e->top_layer = index;
+		}
 	}
 	if (p > e->location)
 		*(p - 1) = '\0';
-- 
2.20.1


  parent reply index

Thread overview: 31+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-04-22 11:58 [PATCH v2 00/10] EDAC/mc/ghes: Fixes, cleanup and reworks Robert Richter
2020-04-22 11:58 ` [PATCH v2 01/10] EDAC/mc: Fix usage of snprintf() and dimm location setup Robert Richter
2020-04-22 20:52   ` Borislav Petkov
2020-05-19  9:27     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 02/10] EDAC/mc: Use int type for parameters of edac_mc_alloc() Robert Richter
2020-04-23 17:49   ` Borislav Petkov
2020-05-19  9:33     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 03/10] EDAC/ghes: Remove unused members of struct ghes_edac_pvt, rename it to ghes_mci Robert Richter
2020-04-23 17:55   ` Borislav Petkov
2020-05-05  7:50     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 04/10] EDAC/ghes: Make SMBIOS handle private data to ghes Robert Richter
2020-04-24 12:12   ` kbuild test robot
2020-04-24 16:21   ` Borislav Petkov
2020-05-05 12:48     ` Robert Richter
2020-04-22 11:58 ` Robert Richter [this message]
2020-04-22 11:58 ` [PATCH v2 06/10] EDAC/ghes: Remove local variable rdr_mask in ghes_edac_dmidecode() Robert Richter
2020-04-27  7:08   ` Borislav Petkov
2020-04-27 17:24     ` Luck, Tony
2020-04-27 17:34       ` Borislav Petkov
2020-05-19  9:34         ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 07/10] EDAC/ghes: Cleanup struct ghes_edac_dimm_fill, rename it to ghes_dimm_fill Robert Richter
2020-04-27 14:00   ` Borislav Petkov
2020-05-19  9:35     ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 08/10] EDAC/ghes: Carve out MC device handling into separate functions Robert Richter
2020-04-27 16:38   ` Borislav Petkov
2020-05-06  8:45     ` Robert Richter
2020-05-11 13:32       ` Borislav Petkov
2020-05-19  9:57         ` Robert Richter
2020-04-22 11:58 ` [PATCH v2 09/10] EDAC/ghes: Have a separate code path for creating the fake MC Robert Richter
2020-04-22 11:58 ` [PATCH v2 10/10] EDAC/ghes: Carve out code into ghes_edac_register_{one,fake}() Robert Richter
2020-05-06  8:53 ` [PATCH v2 00/10] EDAC/mc/ghes: Fixes, cleanup and reworks Robert Richter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200422115814.22205-6-rrichter@marvell.com \
    --to=rrichter@marvell.com \
    --cc=aris@redhat.com \
    --cc=bp@alien8.de \
    --cc=james.morse@arm.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mbrugger@suse.com \
    --cc=mchehab@kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org
	public-inbox-index linux-edac

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git