linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Naveen Krishna Chatradhi <nchatrad@amd.com>
To: <linux-edac@vger.kernel.org>, <x86@kernel.org>
Cc: <linux-kernel@vger.kernel.org>, <bp@alien8.de>,
	<mingo@redhat.com>, <mchehab@kernel.org>, <yazen.ghannam@amd.com>,
	Muralidhara M K <muralimk@amd.com>,
	Naveen Krishna Chatradhi <nchatrad@amd.com>
Subject: [PATCH v4 1/4] x86/amd_nb: Add support for northbridges on Aldebaran
Date: Fri, 15 Oct 2021 00:20:55 +0530	[thread overview]
Message-ID: <20211014185058.9587-2-nchatrad@amd.com> (raw)
In-Reply-To: <20211014185058.9587-1-nchatrad@amd.com>

From: Muralidhara M K <muralimk@amd.com>

On newer systems the CPUs manage MCA errors reported from the GPUs.
Enumerate the GPU nodes with the AMD NB framework to support EDAC.

GPU nodes are enumerated in sequential order based on the PCI hierarchy,
and the first GPU node is assumed to have an "AMD Node ID" value after
CPU Nodes are fully populated.

Aldebaran is an AMD GPU, GPU drivers are part of the DRM framework
https://lists.freedesktop.org/archives/amd-gfx/2021-February/059694.html

Each Aldebaran GPU has 2 Data Fabrics, which are enumerated as 2 nodes.
With this implementation detail, the Data Fabric on the GPU nodes can be
accessed the same way as the Data Fabric on CPU nodes.

Special handling was necessary in northbridge enumeration as the
roots_per_misc value is different for GPU and CPU nodes.

Signed-off-by: Muralidhara M K <muralimk@amd.com>
Co-developed-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Signed-off-by: Naveen Krishna Chatradhi <nchatrad@amd.com>
Link: https://lkml.kernel.org/r/20210823185437.94417-2-nchatrad@amd.com
---
Changes since v3:
1. Use word "gpu" instead of "noncpu" in the patch
2. Do not create pci_dev_ids arrays for gpu nodes
3. Identify the gpu node start index from DF18F1 registers on the GPU nodes.
  a. Export cpu node count and gpu start node id

Changes since v2:
1. Added Reviewed-by Yazen Ghannam

Changes since v1:
1. Modified the commit message and comments in the code
2. Squashed patch 1/7: "x86/amd_nb: Add Aldebaran device to PCI IDs"

 arch/x86/include/asm/amd_nb.h |   9 +++
 arch/x86/kernel/amd_nb.c      | 131 ++++++++++++++++++++++++++++------
 include/linux/pci_ids.h       |   1 +
 3 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 455066a06f60..5898300f11ed 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -68,10 +68,17 @@ struct amd_northbridge {
 	struct threshold_bank *bank4;
 };
 
+/* heterogeneous system node type map variables */
+struct amd_node_map {
+	u16 gpu_node_start_id;
+	u16 cpu_node_count;
+};
+
 struct amd_northbridge_info {
 	u16 num;
 	u64 flags;
 	struct amd_northbridge *nb;
+	struct amd_node_map *nmap;
 };
 
 #define AMD_NB_GART			BIT(0)
@@ -83,6 +90,8 @@ struct amd_northbridge_info {
 u16 amd_nb_num(void);
 bool amd_nb_has_feature(unsigned int feature);
 struct amd_northbridge *node_to_amd_nb(int node);
+u16 amd_gpu_node_start_id(void);
+u16 amd_cpu_node_count(void);
 
 static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev)
 {
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index c92c9c774c0e..54a6a7462f07 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -19,6 +19,7 @@
 #define PCI_DEVICE_ID_AMD_17H_M10H_ROOT	0x15d0
 #define PCI_DEVICE_ID_AMD_17H_M30H_ROOT	0x1480
 #define PCI_DEVICE_ID_AMD_17H_M60H_ROOT	0x1630
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT 0x14bb
 #define PCI_DEVICE_ID_AMD_17H_DF_F4	0x1464
 #define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
 #define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
@@ -28,6 +29,7 @@
 #define PCI_DEVICE_ID_AMD_19H_M40H_ROOT	0x14b5
 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F4 0x167d
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F4 0x166e
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4 0x14d4
 
 /* Protect the PCI config register pairs used for SMN and DF indirect access. */
 static DEFINE_MUTEX(smn_mutex);
@@ -40,6 +42,7 @@ static const struct pci_device_id amd_root_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_ROOT) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT) },
 	{}
 };
 
@@ -63,6 +66,7 @@ static const struct pci_device_id amd_nb_misc_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F3) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F3) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3) },
 	{}
 };
 
@@ -81,6 +85,7 @@ static const struct pci_device_id amd_nb_link_ids[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M40H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_M50H_DF_F4) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F4) },
 	{}
 };
 
@@ -126,6 +131,55 @@ struct amd_northbridge *node_to_amd_nb(int node)
 }
 EXPORT_SYMBOL_GPL(node_to_amd_nb);
 
+/*
+ * GPU start index and CPU count values on an heterogeneous system,
+ * these values will be used by the AMD EDAC and MCE modules.
+ */
+u16 amd_gpu_node_start_id(void)
+{
+	return (amd_northbridges.nmap) ?
+		amd_northbridges.nmap->gpu_node_start_id : 0;
+}
+EXPORT_SYMBOL_GPL(amd_gpu_node_start_id);
+
+u16 amd_cpu_node_count(void)
+{
+	return (amd_northbridges.nmap) ?
+		amd_northbridges.nmap->cpu_node_count : amd_northbridges.num;
+}
+EXPORT_SYMBOL_GPL(amd_cpu_node_count);
+
+/* DF18xF1 regsters on Aldebaran GPU */
+#define REG_LOCAL_NODE_TYPE_MAP		0x144
+#define REG_RMT_NODE_TYPE_MAP		0x148
+
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F1 0x14d1
+
+static int amd_get_node_map(void)
+{
+	struct amd_node_map *np;
+	struct pci_dev *pdev = NULL;
+	u32 tmp;
+
+	pdev = pci_get_device(PCI_VENDOR_ID_AMD,
+			      PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F1, pdev);
+	if (!pdev)
+		return -ENODEV;
+
+	np = kmalloc(sizeof(*np), GFP_KERNEL);
+	if (!np)
+		return -ENOMEM;
+
+	pci_read_config_dword(pdev, REG_LOCAL_NODE_TYPE_MAP, &tmp);
+	np->gpu_node_start_id = tmp & 0xFFF;
+
+	pci_read_config_dword(pdev, REG_RMT_NODE_TYPE_MAP, &tmp);
+	np->cpu_node_count = tmp >> 16 & 0xFFF;
+
+	amd_northbridges.nmap = np;
+	return 0;
+}
+
 static struct pci_dev *next_northbridge(struct pci_dev *dev,
 					const struct pci_device_id *ids)
 {
@@ -230,6 +284,27 @@ int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo)
 }
 EXPORT_SYMBOL_GPL(amd_df_indirect_read);
 
+struct pci_dev *get_root_devs(struct pci_dev *root,
+			      const struct pci_device_id *root_ids,
+			      u16 roots_per_misc)
+{
+	u16 j;
+
+	/*
+	 * If there are more PCI root devices than data fabric/
+	 * system management network interfaces, then the (N)
+	 * PCI roots per DF/SMN interface are functionally the
+	 * same (for DF/SMN access) and N-1 are redundant.  N-1
+	 * PCI roots should be skipped per DF/SMN interface so
+	 * the following DF/SMN interfaces get mapped to
+	 * correct PCI roots.
+	 */
+	for (j = 0; j < roots_per_misc; j++)
+		root = next_northbridge(root, root_ids);
+
+	return root;
+}
+
 int amd_cache_northbridges(void)
 {
 	const struct pci_device_id *misc_ids = amd_nb_misc_ids;
@@ -237,10 +312,10 @@ int amd_cache_northbridges(void)
 	const struct pci_device_id *root_ids = amd_root_ids;
 	struct pci_dev *root, *misc, *link;
 	struct amd_northbridge *nb;
-	u16 roots_per_misc = 0;
-	u16 misc_count = 0;
-	u16 root_count = 0;
-	u16 i, j;
+	u16 roots_per_misc = 0, gpu_roots_per_misc = 0;
+	u16 misc_count = 0, gpu_misc_count = 0;
+	u16 root_count = 0, gpu_root_count = 0;
+	u16 i;
 
 	if (amd_northbridges.num)
 		return 0;
@@ -252,15 +327,23 @@ int amd_cache_northbridges(void)
 	}
 
 	misc = NULL;
-	while ((misc = next_northbridge(misc, misc_ids)) != NULL)
-		misc_count++;
+	while ((misc = next_northbridge(misc, misc_ids)) != NULL) {
+		if (misc->device == PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3)
+			gpu_misc_count++;
+		else
+			misc_count++;
+	}
 
 	if (!misc_count)
 		return -ENODEV;
 
 	root = NULL;
-	while ((root = next_northbridge(root, root_ids)) != NULL)
-		root_count++;
+	while ((root = next_northbridge(root, root_ids)) != NULL) {
+		if (root->device == PCI_DEVICE_ID_AMD_ALDEBARAN_ROOT)
+			gpu_root_count++;
+		else
+			root_count++;
+	}
 
 	if (root_count) {
 		roots_per_misc = root_count / misc_count;
@@ -275,33 +358,35 @@ int amd_cache_northbridges(void)
 		}
 	}
 
-	nb = kcalloc(misc_count, sizeof(struct amd_northbridge), GFP_KERNEL);
+	/*
+	 * The number of miscs, roots and roots_per_misc might vary on different
+	 * nodes of a heterogeneous system.
+	 * calculate roots_per_misc accordingly in order to skip the redundant
+	 * roots and map the DF/SMN interfaces to correct PCI roots.
+	 */
+	if (gpu_root_count && gpu_misc_count) {
+		if (amd_get_node_map())
+			return -ENOMEM;
+
+		gpu_roots_per_misc = gpu_root_count / gpu_misc_count;
+	}
+
+	amd_northbridges.num = misc_count + gpu_misc_count;
+	nb = kcalloc(amd_northbridges.num, sizeof(struct amd_northbridge), GFP_KERNEL);
 	if (!nb)
 		return -ENOMEM;
 
 	amd_northbridges.nb = nb;
-	amd_northbridges.num = misc_count;
 
 	link = misc = root = NULL;
 	for (i = 0; i < amd_northbridges.num; i++) {
+		u16 misc_roots = i < misc_count ? roots_per_misc : gpu_roots_per_misc;
 		node_to_amd_nb(i)->root = root =
-			next_northbridge(root, root_ids);
+			get_root_devs(root, root_ids, misc_roots);
 		node_to_amd_nb(i)->misc = misc =
 			next_northbridge(misc, misc_ids);
 		node_to_amd_nb(i)->link = link =
 			next_northbridge(link, link_ids);
-
-		/*
-		 * If there are more PCI root devices than data fabric/
-		 * system management network interfaces, then the (N)
-		 * PCI roots per DF/SMN interface are functionally the
-		 * same (for DF/SMN access) and N-1 are redundant.  N-1
-		 * PCI roots should be skipped per DF/SMN interface so
-		 * the following DF/SMN interfaces get mapped to
-		 * correct PCI roots.
-		 */
-		for (j = 1; j < roots_per_misc; j++)
-			root = next_northbridge(root, root_ids);
 	}
 
 	if (amd_gart_present())
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 011f2f1ea5bb..b3a0ec29dbd6 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -557,6 +557,7 @@
 #define PCI_DEVICE_ID_AMD_19H_DF_F3	0x1653
 #define PCI_DEVICE_ID_AMD_19H_M40H_DF_F3 0x167c
 #define PCI_DEVICE_ID_AMD_19H_M50H_DF_F3 0x166d
+#define PCI_DEVICE_ID_AMD_ALDEBARAN_DF_F3 0x14d3
 #define PCI_DEVICE_ID_AMD_CNB17H_F3	0x1703
 #define PCI_DEVICE_ID_AMD_LANCE		0x2000
 #define PCI_DEVICE_ID_AMD_LANCE_HOME	0x2001
-- 
2.25.1


  reply	other threads:[~2021-10-14 18:51 UTC|newest]

Thread overview: 67+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-06-30 15:28 [PATCH 0/7] x86/edac/amd64: Add support for noncpu nodes Naveen Krishna Chatradhi
2021-06-30 15:28 ` [PATCH 1/7] x86/amd_nb: Add Aldebaran device to PCI IDs Naveen Krishna Chatradhi
2021-07-19 19:28   ` Yazen Ghannam
2021-08-10 12:45     ` Chatradhi, Naveen Krishna
2021-08-10 13:54       ` Borislav Petkov
2021-06-30 15:28 ` [PATCH 2/7] x86/amd_nb: Add support for northbridges on Aldebaran Naveen Krishna Chatradhi
2021-07-19 20:25   ` Yazen Ghannam
2021-08-10 12:45     ` Chatradhi, Naveen Krishna
2021-06-30 15:28 ` [PATCH 3/7] EDAC/mc: Add new HBM2 memory type Naveen Krishna Chatradhi
2021-07-19 20:47   ` Yazen Ghannam
2021-07-19 21:16     ` Luck, Tony
2021-07-20 12:13       ` Zhuo, Qiuxu
2021-07-20 16:30         ` [PATCH] EDAC/skx_common: Set the memory type correctly for HBM memory Luck, Tony
2021-07-20 16:33     ` [PATCH 3/7] EDAC/mc: Add new HBM2 memory type Luck, Tony
2021-06-30 15:28 ` [PATCH 4/7] EDAC/mce_amd: extract node id from InstanceHi in IPID Naveen Krishna Chatradhi
2021-07-29 16:32   ` Yazen Ghannam
2021-08-10 12:45     ` Chatradhi, Naveen Krishna
2021-06-30 15:28 ` [PATCH 5/7] EDAC/amd64: Enumerate memory on noncpu nodes Naveen Krishna Chatradhi
2021-07-29 17:55   ` Yazen Ghannam
2021-08-10 12:49     ` Chatradhi, Naveen Krishna
2021-06-30 15:28 ` [PATCH 6/7] EDAC/amd64: Add address translation support for DF3.5 Naveen Krishna Chatradhi
2021-07-29 17:59   ` Yazen Ghannam
2021-07-29 18:13     ` Chatradhi, Naveen Krishna
2021-06-30 15:28 ` [PATCH 7/7] EDAC/amd64: Add fixed UMC to CS mapping Naveen Krishna Chatradhi
2021-08-06  7:43 ` [PATCH v2 0/3] x86/edac/amd64: Add support for noncpu nodes Naveen Krishna Chatradhi
2021-08-06  7:43   ` [PATCH v2 1/3] x86/amd_nb: Add support for northbridges on Aldebaran Naveen Krishna Chatradhi
2021-08-20 15:36     ` Yazen Ghannam
2021-08-06  7:43   ` [PATCH v2 2/3] EDAC/mce_amd: Extract node id from InstanceHi in IPID Naveen Krishna Chatradhi
2021-08-20 15:43     ` Yazen Ghannam
2021-08-06  7:43   ` [PATCH v2 3/3] EDAC/amd64: Enumerate memory on noncpu nodes Naveen Krishna Chatradhi
2021-08-20 17:02     ` Yazen Ghannam
2021-08-23 16:56       ` Chatradhi, Naveen Krishna
2021-08-23 18:54     ` [PATCH v3 0/3] x86/edac/amd64: Add support for " Naveen Krishna Chatradhi
2021-08-23 18:54       ` [PATCH v3 1/3] x86/amd_nb: Add support for northbridges on Aldebaran Naveen Krishna Chatradhi
2021-08-25 10:42         ` Borislav Petkov
2021-09-01 18:17           ` Yazen Ghannam
2021-09-02 17:30             ` Borislav Petkov
2021-09-13 18:07               ` Yazen Ghannam
2021-09-16 16:36                 ` Borislav Petkov
2021-10-11 14:26           ` Chatradhi, Naveen Krishna
2021-10-11 18:08             ` Borislav Petkov
2021-08-23 18:54       ` [PATCH v3 2/3] EDAC/mce_amd: Extract node id from MCA_IPID Naveen Krishna Chatradhi
2021-08-27 10:24         ` Borislav Petkov
2021-09-01 18:18           ` Yazen Ghannam
2021-08-23 18:54       ` [PATCH v3 3/3] EDAC/amd64: Enumerate memory on noncpu nodes Naveen Krishna Chatradhi
2021-08-27 11:30         ` Borislav Petkov
2021-09-01 18:42           ` Yazen Ghannam
2021-09-08 18:41             ` Borislav Petkov
2021-09-13 18:19               ` Yazen Ghannam
2021-10-14 18:50       ` [PATCH v4 0/4] x86/edac/amd64: Add support for " Naveen Krishna Chatradhi
2021-10-14 18:50         ` Naveen Krishna Chatradhi [this message]
2021-10-14 18:50         ` [PATCH v4 2/4] EDAC/mce_amd: Extract node id from MCA_IPID Naveen Krishna Chatradhi
2021-10-14 18:50         ` [PATCH 3/4] EDAC/amd64: Extend family ops functions Naveen Krishna Chatradhi
2021-10-14 18:50         ` [PATCH 4/4] EDAC/amd64: Enumerate memory on Aldebaran GPU nodes Naveen Krishna Chatradhi
2021-10-14 18:53       ` [PATCH v4 0/4] x86/edac/amd64: Add support for noncpu nodes Naveen Krishna Chatradhi
2021-10-14 18:53         ` [PATCH v4 1/4] x86/amd_nb: Add support for northbridges on Aldebaran Naveen Krishna Chatradhi
2021-10-21 15:52           ` Yazen Ghannam
2021-10-14 18:53         ` [PATCH v4 2/4] EDAC/mce_amd: Extract node id from MCA_IPID Naveen Krishna Chatradhi
2021-10-21 15:55           ` Yazen Ghannam
2021-10-14 18:53         ` [PATCH v4 3/4] EDAC/amd64: Extend family ops functions Naveen Krishna Chatradhi
2021-10-21 16:27           ` Yazen Ghannam
2021-10-14 18:54         ` [PATCH v4 4/4] EDAC/amd64: Enumerate memory on Aldebaran GPU nodes Naveen Krishna Chatradhi
2021-10-21 16:40           ` Yazen Ghannam
2021-10-14 19:53         ` [PATCH v4 0/4] x86/edac/amd64: Add support for noncpu nodes Borislav Petkov
2021-10-15 12:18           ` Chatradhi, Naveen Krishna
2021-10-15 19:25             ` Borislav Petkov
2021-08-20 17:07   ` [PATCH v2 0/3] " Yazen Ghannam

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20211014185058.9587-2-nchatrad@amd.com \
    --to=nchatrad@amd.com \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=mingo@redhat.com \
    --cc=muralimk@amd.com \
    --cc=x86@kernel.org \
    --cc=yazen.ghannam@amd.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).