Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
From: Robert Richter <rrichter@marvell.com>
To: Borislav Petkov <bp@alien8.de>, Tony Luck <tony.luck@intel.com>,
	"James Morse" <james.morse@arm.com>,
	Mauro Carvalho Chehab <mchehab@kernel.org>
Cc: "linux-edac@vger.kernel.org" <linux-edac@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	Robert Richter <rrichter@marvell.com>
Subject: [PATCH 19/21] EDAC, ghes: Identify dimm by node, card, module and handle
Date: Wed, 29 May 2019 08:44:45 +0000
Message-ID: <20190529084344.28562-20-rrichter@marvell.com> (raw)
In-Reply-To: <20190529084344.28562-1-rrichter@marvell.com>

According to the UEFI Spec. 2.7 (N.2.5 Memory Error Section), a failing
DIMM (module or rank number) can be identified by its error location,
which consists of node, card and module. A module handle is used to map
it to the DIMMs listed in the DMI table. Collect all of this data from
the error record and select the DIMM accordingly. Inconsistent error
records are reported, which is the case when the same DIMM handle
reports errors with a different node, card or module.

This change makes it possible to enable per-layer reporting based on
node, card and module in the next patch.

Signed-off-by: Robert Richter <rrichter@marvell.com>
---
 drivers/edac/ghes_edac.c | 74 +++++++++++++++++++++++++++++++++-------
 1 file changed, 62 insertions(+), 12 deletions(-)

diff --git a/drivers/edac/ghes_edac.c b/drivers/edac/ghes_edac.c
index 4bac643d3404..07c847ed7315 100644
--- a/drivers/edac/ghes_edac.c
+++ b/drivers/edac/ghes_edac.c
@@ -83,8 +83,11 @@ struct memarr_dmi_entry {
 
 struct ghes_dimm_info {
 	struct dimm_info dimm_info;
+	struct dimm_info *dimm;
 	int		idx;
 	int		numa_node;
+	int		card;
+	int		module;
 	phys_addr_t	start;
 	phys_addr_t	end;
 	u16		phys_handle;
@@ -119,6 +122,8 @@ static void ghes_dimm_info_init(void)
 	for_each_dimm(dimm) {
 		dimm->idx	= idx;
 		dimm->numa_node	= NUMA_NO_NODE;
+		dimm->card	= -1;
+		dimm->module	= -1;
 		idx++;
 	}
 }
@@ -401,6 +406,13 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
 
 		if (*dmi_dimm->label)
 			strcpy(mci_dimm->label, dmi_dimm->label);
+
+		/*
+		 * From here on do not use any longer &dimm.dimm_info.
+		 * Instead switch to the mci's dimm info which might
+		 * contain updated data, such as the label.
+		 */
+		dimm->dimm = mci_dimm;
 	}
 
 	if (index != mci->tot_dimms)
@@ -408,24 +420,46 @@ static void mci_add_dimm_info(struct mem_ctl_info *mci)
 			index, mci->tot_dimms);
 }
 
-static struct mem_ctl_info *get_mc_by_node(int nid)
+/* Requires ghes_lock being set. */
+static struct ghes_dimm_info *
+get_and_prepare_dimm_info(int nid, int card, int module, int handle)
 {
-	struct mem_ctl_info *mci = edac_mc_find(nid);
+	static struct ghes_dimm_info *dimm;
+	struct dimm_info *di;
 
-	if (mci)
-		return mci;
+	/*
+	 * We require smbios_handle being set in the error report for
+	 * per layer reporting (SMBIOS handle for the Type 17 Memory
+	 * Device Structure that represents the Memory Module)
+	 */
+	for_each_dimm(dimm) {
+		di = dimm->dimm;
+		if (di->smbios_handle == handle)
+			goto found;
+	}
 
-	if (num_possible_nodes() > 1) {
-		edac_mc_printk(fallback, KERN_WARNING,
-			"Invalid or no node information, falling back to first node: %s",
-			fallback->dev_name);
+	return NULL;
+found:
+	if (dimm->card < 0 && card >= 0)
+		dimm->card = card;
+	if (dimm->module < 0 && module >= 0)
+		dimm->module = module;
+
+	if ((num_possible_nodes() > 1 && di->mci->mc_idx != nid) ||
+		(card >= 0 && card != dimm->card) ||
+		(module >= 0 && module != dimm->module)) {
+		edac_mc_printk(di->mci, KERN_WARNING,
+			"Inconsistent error report (nid/card/module): %d/%d/%d (dimm%d: %d/%d/%d)",
+			nid, card, module, di->idx,
+			di->mci->mc_idx, dimm->card, dimm->module);
 	}
 
-	return fallback;
+	return dimm;
 }
 
 void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 {
+	struct ghes_dimm_info *dimm;
 	struct dimm_info *dimm_info;
 	enum hw_event_mc_err_type type;
 	struct edac_raw_error_desc *e;
@@ -434,6 +468,9 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	unsigned long flags;
 	char *p;
 	int nid = NUMA_NO_NODE;
+	int card = -1;
+	int module = -1;
+	int handle = -1;
 
 	/* We need at least one mc */
 	if (WARN_ON_ONCE(!fallback))
@@ -449,10 +486,23 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 
 	spin_lock_irqsave(&ghes_lock, flags);
 
-	/* select the node's mc device */
 	if (mem_err->validation_bits & CPER_MEM_VALID_NODE)
 		nid = mem_err->node;
-	mci = get_mc_by_node(nid);
+	if (mem_err->validation_bits & CPER_MEM_VALID_CARD)
+		card = mem_err->card;
+	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE)
+		module = mem_err->module;
+	if (mem_err->validation_bits & CPER_MEM_VALID_MODULE_HANDLE)
+		handle = mem_err->mem_dev_handle;
+
+	dimm = get_and_prepare_dimm_info(nid, card, module, handle);
+	if (dimm)
+		mci = dimm->dimm->mci;
+	else
+		mci = edac_mc_find(nid);
+	if (!mci)
+		mci = fallback;
+
 	pvt = mci->pvt_info;
 	e = &mci->error_desc;
 
@@ -670,7 +720,7 @@ void ghes_edac_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
 	if (p > pvt->other_detail)
 		*(p - 1) = '\0';
 
-	dimm_info = edac_get_dimm_by_index(mci, e->top_layer);
+	dimm_info = dimm ? dimm->dimm : NULL;
 
 	edac_raw_mc_handle_error(type, mci, dimm_info, e, -1, -1);
 
-- 
2.20.1


  parent reply index

Thread overview: 43+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-05-29  8:44 [PATCH 00/21] EDAC, mc, ghes: Fixes and updates to improve memory error reporting Robert Richter
2019-05-29  8:44 ` [PATCH 01/21] EDAC, mc: Fix edac_mc_find() in case no device is found Robert Richter
2019-05-29  8:44 ` [PATCH 02/21] EDAC: Fixes to use put_device() after device_add() errors Robert Richter
2019-06-11 17:28   ` Borislav Petkov
2019-06-12 17:17     ` Robert Richter
2019-05-29  8:44 ` [PATCH 03/21] EDAC: Kill EDAC_DIMM_PTR() macro Robert Richter
2019-05-29  8:44 ` [PATCH 04/21] EDAC: Kill EDAC_DIMM_OFF() macro Robert Richter
2019-05-29  8:44 ` [PATCH 05/21] EDAC: Introduce mci_for_each_dimm() iterator Robert Richter
2019-05-29  8:44 ` [PATCH 06/21] EDAC, mc: Cleanup _edac_mc_free() code Robert Richter
2019-05-29  8:44 ` [PATCH 07/21] EDAC, mc: Remove per layer counters Robert Richter
2019-05-29  8:44 ` [PATCH 08/21] EDAC, mc: Rework edac_raw_mc_handle_error() to use struct dimm_info Robert Richter
2019-05-29  8:44 ` [PATCH 09/21] EDAC, ghes: Use standard kernel macros for page calculations Robert Richter
2019-05-29 15:13   ` James Morse
2019-05-29  8:44 ` [PATCH 10/21] EDAC, ghes: Remove pvt->detail_location string Robert Richter
2019-05-29 15:13   ` James Morse
2019-06-12 18:13     ` Robert Richter
2019-05-29  8:44 ` [PATCH 11/21] EDAC, ghes: Unify trace_mc_event() code with edac_mc driver Robert Richter
2019-05-29 15:12   ` James Morse
2019-06-03 13:10     ` Robert Richter
2019-06-04 17:15       ` James Morse
2019-06-13 22:23         ` Robert Richter
2019-05-29  8:44 ` [PATCH 12/21] EDAC, ghes: Add support for legacy API counters Robert Richter
2019-05-29 15:13   ` James Morse
2019-06-12 18:41     ` Robert Richter
2019-06-19 17:22       ` James Morse
2019-06-20  6:55         ` Robert Richter
2019-06-26  9:33           ` James Morse
2019-06-26 10:27             ` Robert Richter
2019-05-29  8:44 ` [PATCH 13/21] EDAC, ghes: Rework memory hierarchy detection Robert Richter
2019-05-29 15:06   ` James Morse
2019-05-31 13:41     ` Robert Richter
2019-05-29  8:44 ` [PATCH 14/21] EDAC, ghes: Extract numa node information for each dimm Robert Richter
2019-05-29 17:51   ` James Morse
2019-06-13 20:52     ` Robert Richter
2019-05-29  8:44 ` [PATCH 15/21] EDAC, ghes: Moving code around ghes_edac_register() Robert Richter
2019-05-29  8:44 ` [PATCH 16/21] EDAC, ghes: Create one memory controller device per node Robert Richter
2019-05-29  8:44 ` [PATCH 17/21] EDAC, ghes: Fill sysfs with the DMI DIMM label information Robert Richter
2019-05-29  8:44 ` [PATCH 18/21] EDAC, mc: Introduce edac_mc_alloc_by_dimm() for per dimm allocation Robert Richter
2019-05-29  8:44 ` Robert Richter [this message]
2019-05-29  8:44 ` [PATCH 20/21] EDAC, ghes: Enable per-layer reporting based on card/module Robert Richter
2019-05-29  8:44 ` [PATCH 21/21] EDAC, Documentation: Describe CPER module definition and DIMM ranks Robert Richter
2019-05-29 14:54 ` [PATCH 00/21] EDAC, mc, ghes: Fixes and updates to improve memory error reporting Borislav Petkov
2019-05-31 14:48   ` Robert Richter

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190529084344.28562-20-rrichter@marvell.com \
    --to=rrichter@marvell.com \
    --cc=bp@alien8.de \
    --cc=james.morse@arm.com \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=tony.luck@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org linux-edac@archiver.kernel.org
	public-inbox-index linux-edac


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/ public-inbox