linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Naveen Krishna Chatradhi <nchatrad@amd.com>
To: <linux-edac@vger.kernel.org>, <x86@kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
	<mingo@redhat.com>, <mchehab@kernel.org>, <yazen.ghannam@amd.com>,
	Naveen Krishna Chatradhi <nchatrad@amd.com>,
	Muralidhara M K <muralimk@amd.com>
Subject: [PATCH v7 08/12] EDAC/amd64: Add Family ops to update GPU csrow and channel info
Date: Thu, 3 Feb 2022 11:49:38 -0600	[thread overview]
Message-ID: <20220203174942.31630-9-nchatrad@amd.com> (raw)
In-Reply-To: <20220203174942.31630-1-nchatrad@amd.com>

GPU node has 'X' number of PHYs and 'Y' number of channels.
This results in 'X*Y' number of instances in the Data Fabric.
Therefore the Data Fabric ID of an instance in GPU as below:
  df_inst_id = 'X' * number of channels per PHY + 'Y'

On CPUs the Data Fabric ID of an instance on a CPU is equal to the
UMC number. since the UMC number and channel are equal in CPU nodes,
the channel can be used as the Data Fabric ID of the instance.

Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Co-developed-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Muralidhara M K <muralimk@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
---
v1->v7:
* New change in v7

 drivers/edac/amd64_edac.c | 60 +++++++++++++++++++++++++++++++++++++--
 drivers/edac/amd64_edac.h |  2 ++
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 10efe726a959..241419a0be93 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -3653,6 +3653,30 @@ static inline void decode_bus_error(int node_id, struct mce *m)
 	__log_ecc_error(mci, &err, ecc_type);
 }
 
+/*
+ * On CPUs, The Data Fabric ID of an instance is equal to the UMC number.
+ * And since the UMC number and channel are equal in CPU nodes, the channel can be used
+ * as the Data Fabric ID of the instance.
+ */
+static int f17_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return err->channel;
+}
+
+/*
+ * A GPU node has 'X' number of PHYs and 'Y' number of channels.
+ * This results in 'X*Y' number of instances in the Data Fabric.
+ * Therefore the Data Fabric ID of an instance can be found with the following formula:
+ * df_inst_id = 'X' * number of channels per PHY + 'Y'
+ *
+ */
+static int gpu_df_inst_id(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+			  struct err_info *err)
+{
+	return (err->csrow * pvt->channel_count / mci->nr_csrows) + err->channel;
+}
+
 /*
  * To find the UMC channel represented by this bank we need to match on its
  * instance_id. The instance_id of a bank is held in the lower 32 bits of its
@@ -3670,12 +3694,38 @@ static void update_umc_err_info(struct mce *m, struct err_info *err)
 	err->csrow = m->synd & 0x7;
 }
 
+/*
+ * The CPUs have one channel per UMC, So  UMC number is equivalent to a
+ * channel number. The GPUs have 8 channels per UMC, so the UMC number no
+ * longer works as a channel number.
+ * The channel number within a GPU UMC is given in MCA_IPID[15:12].
+ * However, the IDs are split such that two UMC values go to one UMC, and
+ * the channel numbers are split in two groups of four.
+ *
+ * Refer comment on get_umc_base_gpu() from amd64_edac.h
+ *
+ * For example,
+ * UMC0 CH[3:0] = 0x0005[3:0]000
+ * UMC0 CH[7:4] = 0x0015[3:0]000
+ * UMC1 CH[3:0] = 0x0025[3:0]000
+ * UMC1 CH[7:4] = 0x0035[3:0]000
+ */
+static void gpu_update_umc_err_info(struct mce *m, struct err_info *err)
+{
+	u8 ch = (m->ipid & GENMASK(31, 0)) >> 20;
+	u8 phy = ((m->ipid >> 12) & 0xf);
+
+	err->channel = ch % 2 ? phy + 4 : phy;
+	err->csrow = phy;
+}
+
 static void decode_umc_error(int node_id, struct mce *m)
 {
 	u8 ecc_type = (m->status >> 45) & 0x3;
 	struct mem_ctl_info *mci;
 	struct amd64_pvt *pvt;
 	struct err_info err;
+	u8 df_inst_id;
 	u64 sys_addr;
 
 	mci = edac_mc_find(node_id);
@@ -3705,7 +3755,9 @@ static void decode_umc_error(int node_id, struct mce *m)
 
 	pvt->ops->get_umc_err_info(m, &err);
 
-	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, err.channel, &sys_addr)) {
+	df_inst_id = pvt->ops->find_umc_inst_id(mci, pvt, &err);
+
+	if (umc_normaddr_to_sysaddr(m->addr, pvt->mc_node_id, df_inst_id, &sys_addr)) {
 		err.err_code = ERR_NORM_ADDR;
 		goto log_error;
 	}
@@ -4625,6 +4677,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
 
 		if (pvt->fam == 0x18) {
 			pvt->ctl_name			= "F18h";
@@ -4678,6 +4731,8 @@ static void per_family_init(struct amd64_pvt *pvt)
 				pvt->ops->dump_misc_regs	= gpu_dump_misc_regs;
 				pvt->ops->get_mc_regs		= gpu_read_mc_regs;
 				pvt->ops->populate_csrows	= gpu_init_csrows;
+				pvt->ops->get_umc_err_info	= gpu_update_umc_err_info;
+				pvt->ops->find_umc_inst_id	= gpu_df_inst_id;
 				goto end_fam;
 			} else {
 				pvt->ctl_name		= "F19h_M30h";
@@ -4707,6 +4762,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 		pvt->ops->get_mc_regs			= __read_mc_regs_df;
 		pvt->ops->populate_csrows		= init_csrows_df;
 		pvt->ops->get_umc_err_info		= update_umc_err_info;
+		pvt->ops->find_umc_inst_id		= f17_df_inst_id;
  end_fam:
 		break;
 
@@ -4728,7 +4784,7 @@ static void per_family_init(struct amd64_pvt *pvt)
 	}
 
 	/* ops required for families 17h and later */
-	if (pvt->fam >= 0x17 && !pvt->ops->get_umc_err_info) {
+	if (pvt->fam >= 0x17 && (!pvt->ops->get_umc_err_info || !pvt->ops->find_umc_inst_id)) {
 		edac_dbg(1, "Platform specific helper routines not defined.\n");
 		return;
 	}
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 71df08a496d2..b6177bd5d5ba 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -496,6 +496,8 @@ struct low_ops {
 	void (*setup_mci_misc_attrs)(struct mem_ctl_info *mci);
 	int  (*populate_csrows)(struct mem_ctl_info *mci);
 	void (*get_umc_err_info)(struct mce *m, struct err_info *err);
+	int  (*find_umc_inst_id)(struct mem_ctl_info *mci, struct amd64_pvt *pvt,
+				 struct err_info *err);
 };
 
 int __amd64_read_pci_cfg_dword(struct pci_dev *pdev, int offset,
-- 
2.25.1


  parent reply	other threads:[~2022-02-03 17:51 UTC|newest]

Thread overview: 22+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-02-03 17:49 [PATCH v7 00/12] x86/edac/amd64: Add support for GPU nodes Naveen Krishna Chatradhi
2022-02-03 17:49 ` [PATCH v7 01/12] EDAC/amd64: Document heterogeneous enumeration Naveen Krishna Chatradhi
2022-02-09 22:34   ` Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 02/12] x86/amd_nb: Add support for northbridges on Aldebaran Naveen Krishna Chatradhi
2022-02-09 23:23   ` Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 03/12] EDAC/mce_amd: Extract node id from MCA_IPID Naveen Krishna Chatradhi
2022-02-09 23:31   ` Yazen Ghannam
2022-02-14 17:54     ` Chatradhi, Naveen Krishna
2022-02-03 17:49 ` [PATCH v7 04/12] EDAC/amd64: Move struct fam_type variables into amd64_pvt structure Naveen Krishna Chatradhi
2022-02-15 15:39   ` Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 05/12] EDAC/amd64: Define dynamic family ops routines Naveen Krishna Chatradhi
2022-02-15 15:49   ` Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 06/12] EDAC/amd64: Add AMD heterogeneous family 19h Model 30h-3fh Naveen Krishna Chatradhi
2022-02-15 16:20   ` Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 07/12] EDAC/amd64: Enumerate Aldebaran GPU nodes by adding family ops Naveen Krishna Chatradhi
2022-02-15 16:34   ` Yazen Ghannam
2022-02-03 17:49 ` Naveen Krishna Chatradhi [this message]
2022-02-15 16:43   ` [PATCH v7 08/12] EDAC/amd64: Add Family ops to update GPU csrow and channel info Yazen Ghannam
2022-02-03 17:49 ` [PATCH v7 09/12] EDAC/amd64: Add check for when to add DRAM base and hole Naveen Krishna Chatradhi
2022-02-03 17:49 ` [PATCH v7 10/12] EDAC/amd64: Save the number of block instances Naveen Krishna Chatradhi
2022-02-03 17:49 ` [PATCH v7 11/12] EDAC/amd64: Add address translation support for DF3.5 Naveen Krishna Chatradhi
2022-02-03 17:49 ` [PATCH v7 12/12] EDAC/amd64: Add fixed UMC to CS mapping Naveen Krishna Chatradhi

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220203174942.31630-9-nchatrad@amd.com \
    --to=nchatrad@amd.com \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=mingo@redhat.com \
    --cc=muralimk@amd.com \
    --cc=x86@kernel.org \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).