linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/6] Bundle of Intel EDAC patches
@ 2021-06-11 17:01 Tony Luck
  2021-06-11 17:01 ` [PATCH 1/6] EDAC/skx_common: Add new ADXL components for 2-level memory Tony Luck
                   ` (5 more replies)
  0 siblings, 6 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Aristeu Rozanski, Borislav Petkov, Mauro Carvalho Chehab,
	Qiuxu Zhuo, linux-edac, linux-kernel

Intel EDAC patches are like trains/buses ... nothing for a long time
and then six arrive together.

Patch 1 & 2:	Fix logging for 2-level memory configurations (to report
		the error in either the "near" (cache) or "far" memory
		as appropriate).

Patch 3:	On-package memory is coming (in the future)

Patch 4, 5, 6:	Add support to igen6_edac driver for three extra CPU
		models.

Qiuxu Zhuo (6):
  EDAC/skx_common: Add new ADXL components for 2-level memory
  EDAC/i10nm: Add detection of memory levels for ICX/SPR servers
  EDAC/i10nm: Add support for high bandwidth memory
  EDAC/igen6: Add Intel ICL-NNPI SoC support
  EDAC/igen6: Add Intel Tiger Lake SoC support
  EDAC/igen6: Add Intel Alder Lake SoC support

 drivers/edac/i10nm_base.c | 171 +++++++++++++++--
 drivers/edac/igen6_edac.c | 374 +++++++++++++++++++++++++++++++++++---
 drivers/edac/skx_common.c |  82 +++++++--
 drivers/edac/skx_common.h |  34 +++-
 4 files changed, 606 insertions(+), 55 deletions(-)


base-commit: 614124bea77e452aa6df7a8714e8bc820b489922
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 1/6] EDAC/skx_common: Add new ADXL components for 2-level memory
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  2021-06-11 17:01 ` [PATCH 2/6] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers Tony Luck
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Some Intel servers may configure memory in 2 levels, using
fast "near" memory (e.g. DDR) as a cache for larger, slower,
"far" memory (e.g. 3D X-point).

In these configurations the BIOS ADXL address translation for
an address in a 2-level memory range will provide details of
both the "near" and "far" components.

The currently exported ADXL components only cover a 1-level memory
system, or the 2nd level memory of a 2-level memory system. So add
new ADXL components for the 1st level memory of a 2-level memory
system, to fully support 2-level memory systems and to detect the
source of a memory error (1st level memory or 2nd level memory).

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/skx_common.c | 67 ++++++++++++++++++++++++++++++++-------
 drivers/edac/skx_common.h | 11 +++++++
 2 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 81c3e2ec6f56..c8691abb720d 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -23,10 +23,13 @@
 #include "skx_common.h"
 
 static const char * const component_names[] = {
-	[INDEX_SOCKET]	= "ProcessorSocketId",
-	[INDEX_MEMCTRL]	= "MemoryControllerId",
-	[INDEX_CHANNEL]	= "ChannelId",
-	[INDEX_DIMM]	= "DimmSlotId",
+	[INDEX_SOCKET]		= "ProcessorSocketId",
+	[INDEX_MEMCTRL]		= "MemoryControllerId",
+	[INDEX_CHANNEL]		= "ChannelId",
+	[INDEX_DIMM]		= "DimmSlotId",
+	[INDEX_NM_MEMCTRL]	= "NmMemoryControllerId",
+	[INDEX_NM_CHANNEL]	= "NmChannelId",
+	[INDEX_NM_DIMM]		= "NmDimmSlotId",
 };
 
 static int component_indices[ARRAY_SIZE(component_names)];
@@ -34,12 +37,14 @@ static int adxl_component_count;
 static const char * const *adxl_component_names;
 static u64 *adxl_values;
 static char *adxl_msg;
+static unsigned long adxl_nm_bitmap;
 
 static char skx_msg[MSG_SIZE];
 static skx_decode_f skx_decode;
 static skx_show_retry_log_f skx_show_retry_rd_err_log;
 static u64 skx_tolm, skx_tohm;
 static LIST_HEAD(dev_edac_list);
+static bool skx_mem_cfg_2lm;
 
 int __init skx_adxl_get(void)
 {
@@ -56,14 +61,25 @@ int __init skx_adxl_get(void)
 		for (j = 0; names[j]; j++) {
 			if (!strcmp(component_names[i], names[j])) {
 				component_indices[i] = j;
+
+				if (i >= INDEX_NM_FIRST)
+					adxl_nm_bitmap |= 1 << i;
+
 				break;
 			}
 		}
 
-		if (!names[j])
+		if (!names[j] && i < INDEX_NM_FIRST)
 			goto err;
 	}
 
+	if (skx_mem_cfg_2lm) {
+		if (!adxl_nm_bitmap)
+			skx_printk(KERN_NOTICE, "Not enough ADXL components for 2-level memory.\n");
+		else
+			edac_dbg(2, "adxl_nm_bitmap: 0x%lx\n", adxl_nm_bitmap);
+	}
+
 	adxl_component_names = names;
 	while (*names++)
 		adxl_component_count++;
@@ -99,7 +115,7 @@ void __exit skx_adxl_put(void)
 	kfree(adxl_msg);
 }
 
-static bool skx_adxl_decode(struct decoded_addr *res)
+static bool skx_adxl_decode(struct decoded_addr *res, bool error_in_1st_level_mem)
 {
 	struct skx_dev *d;
 	int i, len = 0;
@@ -116,11 +132,20 @@ static bool skx_adxl_decode(struct decoded_addr *res)
 	}
 
 	res->socket  = (int)adxl_values[component_indices[INDEX_SOCKET]];
-	res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
-	res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
-	res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
+	if (error_in_1st_level_mem) {
+		res->imc     = (adxl_nm_bitmap & BIT_NM_MEMCTRL) ?
+			       (int)adxl_values[component_indices[INDEX_NM_MEMCTRL]] : -1;
+		res->channel = (adxl_nm_bitmap & BIT_NM_CHANNEL) ?
+			       (int)adxl_values[component_indices[INDEX_NM_CHANNEL]] : -1;
+		res->dimm    = (adxl_nm_bitmap & BIT_NM_DIMM) ?
+			       (int)adxl_values[component_indices[INDEX_NM_DIMM]] : -1;
+	} else {
+		res->imc     = (int)adxl_values[component_indices[INDEX_MEMCTRL]];
+		res->channel = (int)adxl_values[component_indices[INDEX_CHANNEL]];
+		res->dimm    = (int)adxl_values[component_indices[INDEX_DIMM]];
+	}
 
-	if (res->imc > NUM_IMC - 1) {
+	if (res->imc > NUM_IMC - 1 || res->imc < 0) {
 		skx_printk(KERN_ERR, "Bad imc %d\n", res->imc);
 		return false;
 	}
@@ -151,6 +176,11 @@ static bool skx_adxl_decode(struct decoded_addr *res)
 	return true;
 }
 
+void skx_set_mem_cfg(bool mem_cfg_2lm)
+{
+	skx_mem_cfg_2lm = mem_cfg_2lm;
+}
+
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log)
 {
 	skx_decode = decode;
@@ -578,6 +608,21 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			     optype, skx_msg);
 }
 
+static bool skx_error_in_1st_level_mem(const struct mce *m)
+{
+	u32 errcode;
+
+	if (!skx_mem_cfg_2lm)
+		return false;
+
+	errcode = GET_BITFIELD(m->status, 0, 15);
+
+	if ((errcode & 0xef80) != 0x280)
+		return false;
+
+	return true;
+}
+
 int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 			void *data)
 {
@@ -597,7 +642,7 @@ int skx_mce_check_error(struct notifier_block *nb, unsigned long val,
 	res.addr = mce->addr;
 
 	if (adxl_component_count) {
-		if (!skx_adxl_decode(&res))
+		if (!skx_adxl_decode(&res, skx_error_in_1st_level_mem(mce)))
 			return NOTIFY_DONE;
 	} else if (!skx_decode || !skx_decode(&res)) {
 		return NOTIFY_DONE;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index bf56bebff138..8b5a49058ce4 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -9,6 +9,8 @@
 #ifndef _SKX_COMM_EDAC_H
 #define _SKX_COMM_EDAC_H
 
+#include <linux/bits.h>
+
 #define MSG_SIZE		1024
 
 /*
@@ -92,9 +94,17 @@ enum {
 	INDEX_MEMCTRL,
 	INDEX_CHANNEL,
 	INDEX_DIMM,
+	INDEX_NM_FIRST,
+	INDEX_NM_MEMCTRL = INDEX_NM_FIRST,
+	INDEX_NM_CHANNEL,
+	INDEX_NM_DIMM,
 	INDEX_MAX
 };
 
+#define BIT_NM_MEMCTRL	BIT_ULL(INDEX_NM_MEMCTRL)
+#define BIT_NM_CHANNEL	BIT_ULL(INDEX_NM_CHANNEL)
+#define BIT_NM_DIMM	BIT_ULL(INDEX_NM_DIMM)
+
 struct decoded_addr {
 	struct skx_dev *dev;
 	u64	addr;
@@ -133,6 +143,7 @@ typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int le
 int __init skx_adxl_get(void);
 void __exit skx_adxl_put(void);
 void skx_set_decode(skx_decode_f decode, skx_show_retry_log_f show_retry_log);
+void skx_set_mem_cfg(bool mem_cfg_2lm);
 
 int skx_get_src_id(struct skx_dev *d, int off, u8 *id);
 int skx_get_node_id(struct skx_dev *d, u8 *id);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 2/6] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
  2021-06-11 17:01 ` [PATCH 1/6] EDAC/skx_common: Add new ADXL components for 2-level memory Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  2021-06-11 17:01 ` [PATCH 3/6] EDAC/i10nm: Add support for high bandwidth memory Tony Luck
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

The current i10nm_edac driver only supports systems configured in
1-level memory. If the system is configured in 2-level memory, the
driver doesn't report the 1st level memory DIMM for the error address,
even if the error occurs in the 1st level memory.

Both Ice Lake servers and Sapphire Rapids servers can be configured
in 2-level memory. Add detection of memory levels to i10nm_edac for
the two kinds of servers so that the driver can report the 2nd level
memory DIMM or the 1st level memory DIMM according to the error source.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/i10nm_base.c | 39 +++++++++++++++++++++++++++++++++++++++
 drivers/edac/skx_common.h |  3 +++
 2 files changed, 42 insertions(+)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 238a4ad1e526..91431d8922a0 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -24,6 +24,8 @@
 	pci_read_config_dword((d)->uracu, 0xd0, &(reg))
 #define I10NM_GET_IMC_BAR(d, i, reg)	\
 	pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg))
+#define I10NM_GET_SAD(d, offset, i, reg)\
+	pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg))
 #define I10NM_GET_DIMMMTR(m, i, j)	\
 	readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCDDRTCFG(m, i, j)	\
@@ -38,6 +40,10 @@
 #define I10NM_GET_IMC_MMIO_SIZE(reg)	((GET_BITFIELD(reg, 13, 23) - \
 					 GET_BITFIELD(reg, 0, 10) + 1) << 12)
 
+#define I10NM_MAX_SAD			16
+#define I10NM_SAD_ENABLE(reg)		GET_BITFIELD(reg, 0, 0)
+#define I10NM_SAD_NM_CACHEABLE(reg)	GET_BITFIELD(reg, 5, 5)
+
 static struct list_head *i10nm_edac_list;
 
 static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
@@ -63,6 +69,31 @@ static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
 	return pdev;
 }
 
+static bool i10nm_check_2lm(struct res_config *cfg)
+{
+	struct skx_dev *d;
+	u32 reg;
+	int i;
+
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		d->sad_all = pci_get_dev_wrapper(d->seg, d->bus[1],
+						 PCI_SLOT(cfg->sad_all_devfn),
+						 PCI_FUNC(cfg->sad_all_devfn));
+		if (!d->sad_all)
+			continue;
+
+		for (i = 0; i < I10NM_MAX_SAD; i++) {
+			I10NM_GET_SAD(d, cfg->sad_all_offset, i, reg);
+			if (I10NM_SAD_ENABLE(reg) && I10NM_SAD_NM_CACHEABLE(reg)) {
+				edac_dbg(2, "2-level memory configuration.\n");
+				return true;
+			}
+		}
+	}
+
+	return false;
+}
+
 static int i10nm_get_all_munits(void)
 {
 	struct pci_dev *mdev;
@@ -132,6 +163,8 @@ static struct res_config i10nm_cfg0 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xcc,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.sad_all_devfn		= PCI_DEVFN(29, 0),
+	.sad_all_offset		= 0x108,
 };
 
 static struct res_config i10nm_cfg1 = {
@@ -139,6 +172,8 @@ static struct res_config i10nm_cfg1 = {
 	.decs_did		= 0x3452,
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x4000,
+	.sad_all_devfn		= PCI_DEVFN(29, 0),
+	.sad_all_offset		= 0x108,
 };
 
 static struct res_config spr_cfg = {
@@ -147,6 +182,8 @@ static struct res_config spr_cfg = {
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x8000,
 	.support_ddr5		= true,
+	.sad_all_devfn		= PCI_DEVFN(10, 0),
+	.sad_all_offset		= 0x300,
 };
 
 static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -296,6 +333,8 @@ static int __init i10nm_init(void)
 		return -ENODEV;
 	}
 
+	skx_set_mem_cfg(i10nm_check_2lm(cfg));
+
 	rc = i10nm_get_all_munits();
 	if (rc < 0)
 		goto fail;
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 8b5a49058ce4..34e89f7ddf93 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -133,6 +133,9 @@ struct res_config {
 	/* Per DDR channel memory-mapped I/O size */
 	int ddr_chan_mmio_sz;
 	bool support_ddr5;
+	/* SAD device number and function number */
+	unsigned int sad_all_devfn;
+	int sad_all_offset;
 };
 
 typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 3/6] EDAC/i10nm: Add support for high bandwidth memory
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
  2021-06-11 17:01 ` [PATCH 1/6] EDAC/skx_common: Add new ADXL components for 2-level memory Tony Luck
  2021-06-11 17:01 ` [PATCH 2/6] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  2021-06-11 17:01 ` [PATCH 4/6] EDAC/igen6: Add Intel ICL-NNPI SoC support Tony Luck
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Hongyu Ning, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

A future Xeon processor will include in-package HBM (high bandwidth
memory). The in-package HBM memory controller shares the same
architecture with the regular DDR memory controller.

Add the HBM memory controller devices for EDAC support.

Tested-by: Hongyu Ning <hongyu.ning@linux.intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/i10nm_base.c | 132 ++++++++++++++++++++++++++++++++++----
 drivers/edac/skx_common.c |  15 +++--
 drivers/edac/skx_common.h |  20 +++++-
 3 files changed, 148 insertions(+), 19 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index 91431d8922a0..fb7e72d3fd2c 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -13,7 +13,7 @@
 #include "edac_module.h"
 #include "skx_common.h"
 
-#define I10NM_REVISION	"v0.0.4"
+#define I10NM_REVISION	"v0.0.5"
 #define EDAC_MOD_STR	"i10nm_edac"
 
 /* Debug macros */
@@ -26,19 +26,33 @@
 	pci_read_config_dword((d)->uracu, 0xd8 + (i) * 4, &(reg))
 #define I10NM_GET_SAD(d, offset, i, reg)\
 	pci_read_config_dword((d)->sad_all, (offset) + (i) * 8, &(reg))
+#define I10NM_GET_HBM_IMC_BAR(d, reg)	\
+	pci_read_config_dword((d)->uracu, 0xd4, &(reg))
+#define I10NM_GET_CAPID3_CFG(d, reg)	\
+	pci_read_config_dword((d)->pcu_cr3, 0x90, &(reg))
 #define I10NM_GET_DIMMMTR(m, i, j)	\
-	readl((m)->mbase + 0x2080c + (i) * (m)->chan_mmio_sz + (j) * 4)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x80c : 0x2080c) + \
+	(i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCDDRTCFG(m, i, j)	\
-	readl((m)->mbase + 0x20970 + (i) * (m)->chan_mmio_sz + (j) * 4)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x970 : 0x20970) + \
+	(i) * (m)->chan_mmio_sz + (j) * 4)
 #define I10NM_GET_MCMTR(m, i)		\
-	readl((m)->mbase + 0x20ef8 + (i) * (m)->chan_mmio_sz)
+	readl((m)->mbase + ((m)->hbm_mc ? 0xef8 : 0x20ef8) + \
+	(i) * (m)->chan_mmio_sz)
 #define I10NM_GET_AMAP(m, i)		\
-	readl((m)->mbase + 0x20814 + (i) * (m)->chan_mmio_sz)
+	readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \
+	(i) * (m)->chan_mmio_sz)
 
 #define I10NM_GET_SCK_MMIO_BASE(reg)	(GET_BITFIELD(reg, 0, 28) << 23)
 #define I10NM_GET_IMC_MMIO_OFFSET(reg)	(GET_BITFIELD(reg, 0, 10) << 12)
 #define I10NM_GET_IMC_MMIO_SIZE(reg)	((GET_BITFIELD(reg, 13, 23) - \
 					 GET_BITFIELD(reg, 0, 10) + 1) << 12)
+#define I10NM_GET_HBM_IMC_MMIO_OFFSET(reg)	\
+	((GET_BITFIELD(reg, 0, 10) << 12) + 0x140000)
+
+#define I10NM_HBM_IMC_MMIO_SIZE		0x9000
+#define I10NM_IS_HBM_PRESENT(reg)	GET_BITFIELD(reg, 27, 30)
+#define I10NM_IS_HBM_IMC(reg)		GET_BITFIELD(reg, 29, 29)
 
 #define I10NM_MAX_SAD			16
 #define I10NM_SAD_ENABLE(reg)		GET_BITFIELD(reg, 0, 0)
@@ -94,7 +108,7 @@ static bool i10nm_check_2lm(struct res_config *cfg)
 	return false;
 }
 
-static int i10nm_get_all_munits(void)
+static int i10nm_get_ddr_munits(void)
 {
 	struct pci_dev *mdev;
 	void __iomem *mbase;
@@ -122,7 +136,7 @@ static int i10nm_get_all_munits(void)
 		edac_dbg(2, "socket%d mmio base 0x%llx (reg 0x%x)\n",
 			 j++, base, reg);
 
-		for (i = 0; i < I10NM_NUM_IMC; i++) {
+		for (i = 0; i < I10NM_NUM_DDR_IMC; i++) {
 			mdev = pci_get_dev_wrapper(d->seg, d->bus[0],
 						   12 + i, 0);
 			if (i == 0 && !mdev) {
@@ -158,6 +172,90 @@ static int i10nm_get_all_munits(void)
 	return 0;
 }
 
+static bool i10nm_check_hbm_imc(struct skx_dev *d)
+{
+	u32 reg;
+
+	if (I10NM_GET_CAPID3_CFG(d, reg)) {
+		i10nm_printk(KERN_ERR, "Failed to get capid3_cfg\n");
+		return false;
+	}
+
+	return I10NM_IS_HBM_PRESENT(reg) != 0;
+}
+
+static int i10nm_get_hbm_munits(void)
+{
+	struct pci_dev *mdev;
+	void __iomem *mbase;
+	u32 reg, off, mcmtr;
+	struct skx_dev *d;
+	int i, lmc;
+	u64 base;
+
+	list_for_each_entry(d, i10nm_edac_list, list) {
+		d->pcu_cr3 = pci_get_dev_wrapper(d->seg, d->bus[1], 30, 3);
+		if (!d->pcu_cr3)
+			return -ENODEV;
+
+		if (!i10nm_check_hbm_imc(d)) {
+			i10nm_printk(KERN_DEBUG, "No hbm memory\n");
+			return -ENODEV;
+		}
+
+		if (I10NM_GET_SCK_BAR(d, reg)) {
+			i10nm_printk(KERN_ERR, "Failed to get socket bar\n");
+			return -ENODEV;
+		}
+		base = I10NM_GET_SCK_MMIO_BASE(reg);
+
+		if (I10NM_GET_HBM_IMC_BAR(d, reg)) {
+			i10nm_printk(KERN_ERR, "Failed to get hbm mc bar\n");
+			return -ENODEV;
+		}
+		base += I10NM_GET_HBM_IMC_MMIO_OFFSET(reg);
+
+		lmc = I10NM_NUM_DDR_IMC;
+
+		for (i = 0; i < I10NM_NUM_HBM_IMC; i++) {
+			mdev = pci_get_dev_wrapper(d->seg, d->bus[0],
+						   12 + i / 4, 1 + i % 4);
+			if (i == 0 && !mdev) {
+				i10nm_printk(KERN_ERR, "No hbm mc found\n");
+				return -ENODEV;
+			}
+			if (!mdev)
+				continue;
+
+			d->imc[lmc].mdev = mdev;
+			off = i * I10NM_HBM_IMC_MMIO_SIZE;
+
+			edac_dbg(2, "hbm mc%d mmio base 0x%llx size 0x%x\n",
+				 lmc, base + off, I10NM_HBM_IMC_MMIO_SIZE);
+
+			mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE);
+			if (!mbase) {
+				i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n",
+					     base + off);
+				return -ENOMEM;
+			}
+
+			d->imc[lmc].mbase = mbase;
+			d->imc[lmc].hbm_mc = true;
+
+			mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0);
+			if (!I10NM_IS_HBM_IMC(mcmtr)) {
+				i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n");
+				return -ENODEV;
+			}
+
+			lmc++;
+		}
+	}
+
+	return 0;
+}
+
 static struct res_config i10nm_cfg0 = {
 	.type			= I10NM,
 	.decs_did		= 0x3452,
@@ -181,6 +279,7 @@ static struct res_config spr_cfg = {
 	.decs_did		= 0x3252,
 	.busno_cfg_offset	= 0xd0,
 	.ddr_chan_mmio_sz	= 0x8000,
+	.hbm_chan_mmio_sz	= 0x4000,
 	.support_ddr5		= true,
 	.sad_all_devfn		= PCI_DEVFN(10, 0),
 	.sad_all_offset		= 0x300,
@@ -216,13 +315,13 @@ static int i10nm_get_dimm_config(struct mem_ctl_info *mci,
 	struct dimm_info *dimm;
 	int i, j, ndimms;
 
-	for (i = 0; i < I10NM_NUM_CHANNELS; i++) {
+	for (i = 0; i < imc->num_channels; i++) {
 		if (!imc->mbase)
 			continue;
 
 		ndimms = 0;
 		amap = I10NM_GET_AMAP(imc, i);
-		for (j = 0; j < I10NM_NUM_DIMMS; j++) {
+		for (j = 0; j < imc->num_dimms; j++) {
 			dimm = edac_get_dimm(mci, i, j, 0);
 			mtr = I10NM_GET_DIMMMTR(imc, i, j);
 			mcddrtcfg = I10NM_GET_MCDDRTCFG(imc, i, j);
@@ -335,8 +434,9 @@ static int __init i10nm_init(void)
 
 	skx_set_mem_cfg(i10nm_check_2lm(cfg));
 
-	rc = i10nm_get_all_munits();
-	if (rc < 0)
+	rc = i10nm_get_ddr_munits();
+
+	if (i10nm_get_hbm_munits() && rc)
 		goto fail;
 
 	list_for_each_entry(d, i10nm_edac_list, list) {
@@ -357,7 +457,15 @@ static int __init i10nm_init(void)
 			d->imc[i].lmc = i;
 			d->imc[i].src_id  = src_id;
 			d->imc[i].node_id = node_id;
-			d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz;
+			if (d->imc[i].hbm_mc) {
+				d->imc[i].chan_mmio_sz = cfg->hbm_chan_mmio_sz;
+				d->imc[i].num_channels = I10NM_NUM_HBM_CHANNELS;
+				d->imc[i].num_dimms    = I10NM_NUM_HBM_DIMMS;
+			} else {
+				d->imc[i].chan_mmio_sz = cfg->ddr_chan_mmio_sz;
+				d->imc[i].num_channels = I10NM_NUM_DDR_CHANNELS;
+				d->imc[i].num_dimms    = I10NM_NUM_DDR_DIMMS;
+			}
 
 			rc = skx_register_mci(&d->imc[i], d->imc[i].mdev,
 					      "Intel_10nm Socket", EDAC_MOD_STR,
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index c8691abb720d..5e83f59bef8a 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -343,9 +343,9 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 
 	ranks = numrank(mtr);
 	rows = numrow(mtr);
-	cols = numcol(mtr);
+	cols = imc->hbm_mc ? 6 : numcol(mtr);
 
-	if (cfg->support_ddr5 && (amap & 0x8)) {
+	if (cfg->support_ddr5 && ((amap & 0x8) || imc->hbm_mc)) {
 		banks = 32;
 		mtype = MEM_DDR5;
 	} else {
@@ -374,8 +374,13 @@ int skx_get_dimm_info(u32 mtr, u32 mcmtr, u32 amap, struct dimm_info *dimm,
 	dimm->dtype = get_width(mtr);
 	dimm->mtype = mtype;
 	dimm->edac_mode = EDAC_SECDED; /* likely better than this */
-	snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
-		 imc->src_id, imc->lmc, chan, dimmno);
+
+	if (imc->hbm_mc)
+		snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_HBMC#%u_Chan#%u",
+			 imc->src_id, imc->lmc, chan);
+	else
+		snprintf(dimm->label, sizeof(dimm->label), "CPU_SrcID#%u_MC#%u_Chan#%u_DIMM#%u",
+			 imc->src_id, imc->lmc, chan, dimmno);
 
 	return 1;
 }
@@ -703,6 +708,8 @@ void skx_remove(void)
 		}
 		if (d->util_all)
 			pci_dev_put(d->util_all);
+		if (d->pcu_cr3)
+			pci_dev_put(d->pcu_cr3);
 		if (d->sad_all)
 			pci_dev_put(d->sad_all);
 		if (d->uracu)
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 34e89f7ddf93..01f67e731766 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -32,9 +32,17 @@
 #define SKX_NUM_CHANNELS	3	/* Channels per memory controller */
 #define SKX_NUM_DIMMS		2	/* Max DIMMS per channel */
 
-#define I10NM_NUM_IMC		4
-#define I10NM_NUM_CHANNELS	2
-#define I10NM_NUM_DIMMS		2
+#define I10NM_NUM_DDR_IMC	4
+#define I10NM_NUM_DDR_CHANNELS	2
+#define I10NM_NUM_DDR_DIMMS	2
+
+#define I10NM_NUM_HBM_IMC	16
+#define I10NM_NUM_HBM_CHANNELS	2
+#define I10NM_NUM_HBM_DIMMS	1
+
+#define I10NM_NUM_IMC		(I10NM_NUM_DDR_IMC + I10NM_NUM_HBM_IMC)
+#define I10NM_NUM_CHANNELS	MAX(I10NM_NUM_DDR_CHANNELS, I10NM_NUM_HBM_CHANNELS)
+#define I10NM_NUM_DIMMS		MAX(I10NM_NUM_DDR_DIMMS, I10NM_NUM_HBM_DIMMS)
 
 #define MAX(a, b)	((a) > (b) ? (a) : (b))
 #define NUM_IMC		MAX(SKX_NUM_IMC, I10NM_NUM_IMC)
@@ -56,12 +64,16 @@ struct skx_dev {
 	struct pci_dev *sad_all;
 	struct pci_dev *util_all;
 	struct pci_dev *uracu; /* for i10nm CPU */
+	struct pci_dev *pcu_cr3; /* for HBM memory detection */
 	u32 mcroute;
 	struct skx_imc {
 		struct mem_ctl_info *mci;
 		struct pci_dev *mdev; /* for i10nm CPU */
 		void __iomem *mbase;  /* for i10nm CPU */
 		int chan_mmio_sz;     /* for i10nm CPU */
+		int num_channels; /* channels per memory controller */
+		int num_dimms; /* dimms per channel */
+		bool hbm_mc;
 		u8 mc;	/* system wide mc# */
 		u8 lmc;	/* socket relative mc# */
 		u8 src_id, node_id;
@@ -132,6 +144,8 @@ struct res_config {
 	int busno_cfg_offset;
 	/* Per DDR channel memory-mapped I/O size */
 	int ddr_chan_mmio_sz;
+	/* Per HBM channel memory-mapped I/O size */
+	int hbm_chan_mmio_sz;
 	bool support_ddr5;
 	/* SAD device number and function number */
 	unsigned int sad_all_devfn;
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 4/6] EDAC/igen6: Add Intel ICL-NNPI SoC support
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
                   ` (2 preceding siblings ...)
  2021-06-11 17:01 ` [PATCH 3/6] EDAC/i10nm: Add support for high bandwidth memory Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  2021-06-11 17:01 ` [PATCH 5/6] EDAC/igen6: Add Intel Tiger Lake " Tony Luck
  2021-06-11 17:01 ` [PATCH 6/6] EDAC/igen6: Add Intel Alder " Tony Luck
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

The Ice Lake Neural Network Processor for Deep Learning Inference
(ICL-NNPI) SoC shares the same memory controller and In-Band ECC with
Elkhart Lake SoC. Add the ICL-NNPI compute die IDs for EDAC support.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/igen6_edac.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 6be9986fc6bd..0fedf2d206d3 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -183,6 +183,12 @@ static struct work_struct ecclog_work;
 #define DID_EHL_SKU14	0x4534
 #define DID_EHL_SKU15	0x4536
 
+/* Compute die IDs for ICL-NNPI with IBECC */
+#define DID_ICL_SKU8	0x4581
+#define DID_ICL_SKU10	0x4585
+#define DID_ICL_SKU11	0x4589
+#define DID_ICL_SKU12	0x458d
+
 static bool ehl_ibecc_available(struct pci_dev *pdev)
 {
 	u32 v;
@@ -212,6 +218,17 @@ static u64 ehl_err_addr_to_imc_addr(u64 eaddr)
 	return eaddr;
 }
 
+static bool icl_ibecc_available(struct pci_dev *pdev)
+{
+	u32 v;
+
+	if (pci_read_config_dword(pdev, CAPID_C_OFFSET, &v))
+		return false;
+
+	return !(CAPID_C_IBECC & v) &&
+		(boot_cpu_data.x86_stepping >= 1);
+}
+
 static struct res_config ehl_cfg = {
 	.num_imc	 = 1,
 	.ibecc_base	 = 0xdc00,
@@ -220,6 +237,14 @@ static struct res_config ehl_cfg = {
 	.err_addr_to_imc_addr  = ehl_err_addr_to_imc_addr,
 };
 
+static struct res_config icl_cfg = {
+	.num_imc	 = 1,
+	.ibecc_base	 = 0xd800,
+	.ibecc_available = icl_ibecc_available,
+	.err_addr_to_sys_addr  = ehl_err_addr_to_sys_addr,
+	.err_addr_to_imc_addr  = ehl_err_addr_to_imc_addr,
+};
+
 static const struct pci_device_id igen6_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg },
@@ -232,6 +257,10 @@ static const struct pci_device_id igen6_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU13), (kernel_ulong_t)&ehl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU14), (kernel_ulong_t)&ehl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU15), (kernel_ulong_t)&ehl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ICL_SKU8), (kernel_ulong_t)&icl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ICL_SKU10), (kernel_ulong_t)&icl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ICL_SKU11), (kernel_ulong_t)&icl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ICL_SKU12), (kernel_ulong_t)&icl_cfg },
 	{ },
 };
 MODULE_DEVICE_TABLE(pci, igen6_pci_tbl);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 5/6] EDAC/igen6: Add Intel Tiger Lake SoC support
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
                   ` (3 preceding siblings ...)
  2021-06-11 17:01 ` [PATCH 4/6] EDAC/igen6: Add Intel ICL-NNPI SoC support Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  2021-06-11 17:01 ` [PATCH 6/6] EDAC/igen6: Add Intel Alder " Tony Luck
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Tiger Lake SoC shares the same memory controller and In-Band ECC
(IBECC) IP with Elkhart Lake SoC. The main differences are that Tiger
Lake has two memory controllers each associated with one IBECC and
uses Machine Check for the memory error notification.

So add Tiger Lake compute die IDs, MCE decoding chain registration,
and memory slice decoding for Tiger Lake EDAC support.

Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/igen6_edac.c | 273 +++++++++++++++++++++++++++++++++++---
 1 file changed, 253 insertions(+), 20 deletions(-)

diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 0fedf2d206d3..9f653ace50e7 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -22,6 +22,7 @@
 #include <linux/io.h>
 #include <asm/mach_traps.h>
 #include <asm/nmi.h>
+#include <asm/mce.h>
 
 #include "edac_mc.h"
 #include "edac_module.h"
@@ -40,7 +41,7 @@
 
 #define GET_BITFIELD(v, lo, hi) (((v) & GENMASK_ULL(hi, lo)) >> (lo))
 
-#define NUM_IMC				1 /* Max memory controllers */
+#define NUM_IMC				2 /* Max memory controllers */
 #define NUM_CHANNELS			2 /* Max channels */
 #define NUM_DIMMS			2 /* Max DIMMs per channel */
 
@@ -54,6 +55,10 @@
 #define CAPID_C_OFFSET			0xec
 #define CAPID_C_IBECC			BIT(15)
 
+/* Capability register E */
+#define CAPID_E_OFFSET			0xf0
+#define CAPID_E_IBECC			BIT(12)
+
 /* Error Status */
 #define ERRSTS_OFFSET			0xc8
 #define ERRSTS_CE			BIT_ULL(6)
@@ -109,12 +114,20 @@
 #define CHANNEL_HASH_LSB_MASK_BIT(v)	GET_BITFIELD(v, 24, 26)
 #define CHANNEL_HASH_MODE(v)		GET_BITFIELD(v, 28, 28)
 
+/* Parameters for memory slice decode stage */
+#define MEM_SLICE_HASH_MASK(v)		(GET_BITFIELD(v, 6, 19) << 6)
+#define MEM_SLICE_HASH_LSB_MASK_BIT(v)	GET_BITFIELD(v, 24, 26)
+
 static struct res_config {
+	bool machine_check;
 	int num_imc;
+	u32 cmf_base;
+	u32 cmf_size;
+	u32 ms_hash_offset;
 	u32 ibecc_base;
 	bool (*ibecc_available)(struct pci_dev *pdev);
 	/* Convert error address logged in IBECC to system physical address */
-	u64 (*err_addr_to_sys_addr)(u64 eaddr);
+	u64 (*err_addr_to_sys_addr)(u64 eaddr, int mc);
 	/* Convert error address logged in IBECC to integrated memory controller address */
 	u64 (*err_addr_to_imc_addr)(u64 eaddr);
 } *res_cfg;
@@ -125,6 +138,7 @@ struct igen6_imc {
 	struct pci_dev *pdev;
 	struct device dev;
 	void __iomem *window;
+	u64 size;
 	u64 ch_s_size;
 	int ch_l_map;
 	u64 dimm_s_size[NUM_CHANNELS];
@@ -134,6 +148,9 @@ struct igen6_imc {
 
 static struct igen6_pvt {
 	struct igen6_imc imc[NUM_IMC];
+	u64 ms_hash;
+	u64 ms_s_size;
+	int ms_l_map;
 } *igen6_pvt;
 
 /* The top of low usable DRAM */
@@ -189,6 +206,9 @@ static struct work_struct ecclog_work;
 #define DID_ICL_SKU11	0x4589
 #define DID_ICL_SKU12	0x458d
 
+/* Compute die IDs for Tiger Lake with IBECC */
+#define DID_TGL_SKU	0x9a14
+
 static bool ehl_ibecc_available(struct pci_dev *pdev)
 {
 	u32 v;
@@ -199,7 +219,7 @@ static bool ehl_ibecc_available(struct pci_dev *pdev)
 	return !!(CAPID_C_IBECC & v);
 }
 
-static u64 ehl_err_addr_to_sys_addr(u64 eaddr)
+static u64 ehl_err_addr_to_sys_addr(u64 eaddr, int mc)
 {
 	return eaddr;
 }
@@ -229,20 +249,103 @@ static bool icl_ibecc_available(struct pci_dev *pdev)
 		(boot_cpu_data.x86_stepping >= 1);
 }
 
+static bool tgl_ibecc_available(struct pci_dev *pdev)
+{
+	u32 v;
+
+	if (pci_read_config_dword(pdev, CAPID_E_OFFSET, &v))
+		return false;
+
+	return !(CAPID_E_IBECC & v);
+}
+
+static u64 mem_addr_to_sys_addr(u64 maddr)
+{
+	if (maddr < igen6_tolud)
+		return maddr;
+
+	if (igen6_tom <= _4GB)
+		return maddr - igen6_tolud + _4GB;
+
+	if (maddr < _4GB)
+		return maddr - igen6_tolud + igen6_tom;
+
+	return maddr;
+}
+
+static u64 mem_slice_hash(u64 addr, u64 mask, u64 hash_init, int intlv_bit)
+{
+	u64 hash_addr = addr & mask, hash = hash_init;
+	u64 intlv = (addr >> intlv_bit) & 1;
+	int i;
+
+	for (i = 6; i < 20; i++)
+		hash ^= (hash_addr >> i) & 1;
+
+	return hash ^ intlv;
+}
+
+static u64 tgl_err_addr_to_mem_addr(u64 eaddr, int mc)
+{
+	u64 maddr, hash, mask, ms_s_size;
+	int intlv_bit;
+	u32 ms_hash;
+
+	ms_s_size = igen6_pvt->ms_s_size;
+	if (eaddr >= ms_s_size)
+		return eaddr + ms_s_size;
+
+	ms_hash = igen6_pvt->ms_hash;
+
+	mask = MEM_SLICE_HASH_MASK(ms_hash);
+	intlv_bit = MEM_SLICE_HASH_LSB_MASK_BIT(ms_hash) + 6;
+
+	maddr = GET_BITFIELD(eaddr, intlv_bit, 63) << (intlv_bit + 1) |
+		GET_BITFIELD(eaddr, 0, intlv_bit - 1);
+
+	hash = mem_slice_hash(maddr, mask, mc, intlv_bit);
+
+	return maddr | (hash << intlv_bit);
+}
+
+static u64 tgl_err_addr_to_sys_addr(u64 eaddr, int mc)
+{
+	u64 maddr = tgl_err_addr_to_mem_addr(eaddr, mc);
+
+	return mem_addr_to_sys_addr(maddr);
+}
+
+static u64 tgl_err_addr_to_imc_addr(u64 eaddr)
+{
+	return eaddr;
+}
+
 static struct res_config ehl_cfg = {
-	.num_imc	 = 1,
-	.ibecc_base	 = 0xdc00,
-	.ibecc_available = ehl_ibecc_available,
-	.err_addr_to_sys_addr  = ehl_err_addr_to_sys_addr,
-	.err_addr_to_imc_addr  = ehl_err_addr_to_imc_addr,
+	.num_imc		= 1,
+	.ibecc_base		= 0xdc00,
+	.ibecc_available	= ehl_ibecc_available,
+	.err_addr_to_sys_addr	= ehl_err_addr_to_sys_addr,
+	.err_addr_to_imc_addr	= ehl_err_addr_to_imc_addr,
 };
 
 static struct res_config icl_cfg = {
-	.num_imc	 = 1,
-	.ibecc_base	 = 0xd800,
-	.ibecc_available = icl_ibecc_available,
-	.err_addr_to_sys_addr  = ehl_err_addr_to_sys_addr,
-	.err_addr_to_imc_addr  = ehl_err_addr_to_imc_addr,
+	.num_imc		= 1,
+	.ibecc_base		= 0xd800,
+	.ibecc_available	= icl_ibecc_available,
+	.err_addr_to_sys_addr	= ehl_err_addr_to_sys_addr,
+	.err_addr_to_imc_addr	= ehl_err_addr_to_imc_addr,
+};
+
+static struct res_config tgl_cfg = {
+	.machine_check		= true,
+	.num_imc		= 2,
+	.cmf_base		= 0x11000,
+	.cmf_size		= 0x800,
+	.ms_hash_offset		= 0xac,
+	.ibecc_base		= 0xd400,
+	.ibecc_available	= tgl_ibecc_available,
+	.err_addr_to_sys_addr	= tgl_err_addr_to_sys_addr,
+	.err_addr_to_imc_addr	= tgl_err_addr_to_imc_addr,
 };
 
 static const struct pci_device_id igen6_pci_tbl[] = {
@@ -261,6 +364,7 @@ static const struct pci_device_id igen6_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, DID_ICL_SKU10), (kernel_ulong_t)&icl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_ICL_SKU11), (kernel_ulong_t)&icl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_ICL_SKU12), (kernel_ulong_t)&icl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_TGL_SKU), (kernel_ulong_t)&tgl_cfg },
 	{ },
 };
 MODULE_DEVICE_TABLE(pci, igen6_pci_tbl);
@@ -519,7 +623,7 @@ static void ecclog_work_cb(struct work_struct *work)
 		eaddr = ECC_ERROR_LOG_ADDR(node->ecclog) <<
 			ECC_ERROR_LOG_ADDR_SHIFT;
 		res.mc	     = node->mc;
-		res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr);
+		res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr, res.mc);
 		res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr);
 
 		mci = igen6_pvt->imc[res.mc].mci;
@@ -569,6 +673,57 @@ static int ecclog_nmi_handler(unsigned int cmd, struct pt_regs *regs)
 	return NMI_HANDLED;
 }
 
+static int ecclog_mce_handler(struct notifier_block *nb, unsigned long val,
+			      void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	char *type;
+
+	if (mce->kflags & MCE_HANDLED_CEC)
+		return NOTIFY_DONE;
+
+	/*
+	 * Ignore unless this is a memory related error.
+	 * We don't check the bit MCI_STATUS_ADDRV of MCi_STATUS here,
+	 * since this bit isn't set on some CPU (e.g., Tiger Lake UP3).
+	 */
+	if ((mce->status & 0xefff) >> 7 != 1)
+		return NOTIFY_DONE;
+
+	if (mce->mcgstatus & MCG_STATUS_MCIP)
+		type = "Exception";
+	else
+		type = "Event";
+
+	edac_dbg(0, "CPU %d: Machine Check %s: 0x%llx Bank %d: 0x%llx\n",
+		 mce->extcpu, type, mce->mcgstatus,
+		 mce->bank, mce->status);
+	edac_dbg(0, "TSC 0x%llx\n", mce->tsc);
+	edac_dbg(0, "ADDR 0x%llx\n", mce->addr);
+	edac_dbg(0, "MISC 0x%llx\n", mce->misc);
+	edac_dbg(0, "PROCESSOR %u:0x%x TIME %llu SOCKET %u APIC 0x%x\n",
+		 mce->cpuvendor, mce->cpuid, mce->time,
+		 mce->socketid, mce->apicid);
+	/*
+	 * We just use the Machine Check for the memory error notification.
+	 * Each memory controller is associated with an IBECC instance.
+	 * Directly read and clear the error information(error address and
+	 * error type) on all the IBECC instances so that we know on which
+	 * memory controller the memory error(s) occurred.
+	 */
+	if (!ecclog_handler())
+		return NOTIFY_DONE;
+
+	mce->kflags |= MCE_HANDLED_EDAC;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block ecclog_mce_dec = {
+	.notifier_call	= ecclog_mce_handler,
+	.priority	= MCE_PRIO_EDAC,
+};
+
 static bool igen6_check_ecc(struct igen6_imc *imc)
 {
 	u32 activate = readl(imc->window + IBECC_ACTIVATE_OFFSET);
@@ -602,6 +757,8 @@ static int igen6_get_dimm_config(struct mem_ctl_info *mci)
 		imc->dimm_l_size[i] = MAD_DIMM_CH_DIMM_L_SIZE(mad_dimm);
 		imc->dimm_s_size[i] = MAD_DIMM_CH_DIMM_S_SIZE(mad_dimm);
 		imc->dimm_l_map[i]  = MAD_INTRA_CH_DIMM_L_MAP(mad_intra);
+		imc->size += imc->dimm_s_size[i];
+		imc->size += imc->dimm_l_size[i];
 		ndimms = 0;
 
 		for (j = 0; j < NUM_DIMMS; j++) {
@@ -637,6 +794,8 @@ static int igen6_get_dimm_config(struct mem_ctl_info *mci)
 		}
 	}
 
+	edac_dbg(0, "MC %d, total size %llu MiB\n", mc, imc->size >> 20);
+
 	return 0;
 }
 
@@ -886,6 +1045,77 @@ static void igen6_unregister_mcis(void)
 	}
 }
 
+static int igen6_mem_slice_setup(u64 mchbar)
+{
+	struct igen6_imc *imc = &igen6_pvt->imc[0];
+	u64 base = mchbar + res_cfg->cmf_base;
+	u32 offset = res_cfg->ms_hash_offset;
+	u32 size = res_cfg->cmf_size;
+	u64 ms_s_size, ms_hash;
+	void __iomem *cmf;
+	int ms_l_map;
+
+	edac_dbg(2, "\n");
+
+	if (imc[0].size < imc[1].size) {
+		ms_s_size = imc[0].size;
+		ms_l_map  = 1;
+	} else {
+		ms_s_size = imc[1].size;
+		ms_l_map  = 0;
+	}
+
+	igen6_pvt->ms_s_size = ms_s_size;
+	igen6_pvt->ms_l_map  = ms_l_map;
+
+	edac_dbg(0, "ms_s_size: %llu MiB, ms_l_map %d\n",
+		 ms_s_size >> 20, ms_l_map);
+
+	cmf = ioremap(base, size);
+	if (!cmf) {
+		igen6_printk(KERN_ERR, "Failed to ioremap cmf 0x%llx\n", base);
+		return -ENODEV;
+	}
+
+	ms_hash = readq(cmf + offset);
+	igen6_pvt->ms_hash = ms_hash;
+
+	edac_dbg(0, "MEM_SLICE_HASH: 0x%llx\n", ms_hash);
+
+	iounmap(cmf);
+
+	return 0;
+}
+
+static int register_err_handler(void)
+{
+	int rc;
+
+	if (res_cfg->machine_check) {
+		mce_register_decode_chain(&ecclog_mce_dec);
+		return 0;
+	}
+
+	rc = register_nmi_handler(NMI_SERR, ecclog_nmi_handler,
+				  0, IGEN6_NMI_NAME);
+	if (rc) {
+		igen6_printk(KERN_ERR, "Failed to register NMI handler\n");
+		return rc;
+	}
+
+	return 0;
+}
+
+static void unregister_err_handler(void)
+{
+	if (res_cfg->machine_check) {
+		mce_unregister_decode_chain(&ecclog_mce_dec);
+		return;
+	}
+
+	unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME);
+}
+
 static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 {
 	u64 mchbar;
@@ -909,6 +1139,12 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 			goto fail2;
 	}
 
+	if (res_cfg->num_imc > 1) {
+		rc = igen6_mem_slice_setup(mchbar);
+		if (rc)
+			goto fail2;
+	}
+
 	ecclog_pool = ecclog_gen_pool_create();
 	if (!ecclog_pool) {
 		rc = -ENOMEM;
@@ -921,12 +1157,9 @@ static int igen6_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	/* Check if any pending errors before registering the NMI handler */
 	ecclog_handler();
 
-	rc = register_nmi_handler(NMI_SERR, ecclog_nmi_handler,
-				  0, IGEN6_NMI_NAME);
-	if (rc) {
-		igen6_printk(KERN_ERR, "Failed to register NMI handler\n");
+	rc = register_err_handler();
+	if (rc)
 		goto fail3;
-	}
 
 	/* Enable error reporting */
 	rc = errcmd_enable_error_reporting(true);
@@ -954,7 +1187,7 @@ static void igen6_remove(struct pci_dev *pdev)
 
 	igen6_debug_teardown();
 	errcmd_enable_error_reporting(false);
-	unregister_nmi_handler(NMI_SERR, IGEN6_NMI_NAME);
+	unregister_err_handler();
 	irq_work_sync(&ecclog_irq_work);
 	flush_work(&ecclog_work);
 	gen_pool_destroy(ecclog_pool);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH 6/6] EDAC/igen6: Add Intel Alder Lake SoC support
  2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
                   ` (4 preceding siblings ...)
  2021-06-11 17:01 ` [PATCH 5/6] EDAC/igen6: Add Intel Tiger Lake " Tony Luck
@ 2021-06-11 17:01 ` Tony Luck
  5 siblings, 0 replies; 7+ messages in thread
From: Tony Luck @ 2021-06-11 17:01 UTC (permalink / raw)
  To: tony.luck
  Cc: Qiuxu Zhuo, Vrukesh V Panse, Aristeu Rozanski, Borislav Petkov,
	Mauro Carvalho Chehab, linux-edac, linux-kernel

From: Qiuxu Zhuo <qiuxu.zhuo@intel.com>

Alder Lake SoC shares the same memory controller and In-Band ECC
(IBECC) IP with Tiger Lake SoC. Like Tiger Lake, it also has two
memory controllers, each associated with one IBECC instance. The minor
differences include the MMIO offset of each memory controller and
the type of memory error address logged in the IBECC.

So add Alder Lake compute die IDs, adjust the MMIO offset for each
memory controller and handle the type of memory error address logged
in the IBECC for Alder Lake EDAC support.

Tested-by: Vrukesh V Panse <vrukesh.v.panse@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/igen6_edac.c | 84 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 11 deletions(-)

diff --git a/drivers/edac/igen6_edac.c b/drivers/edac/igen6_edac.c
index 9f653ace50e7..a07bbfd075d0 100644
--- a/drivers/edac/igen6_edac.c
+++ b/drivers/edac/igen6_edac.c
@@ -27,7 +27,7 @@
 #include "edac_mc.h"
 #include "edac_module.h"
 
-#define IGEN6_REVISION	"v2.4"
+#define IGEN6_REVISION	"v2.5"
 
 #define EDAC_MOD_STR	"igen6_edac"
 #define IGEN6_NMI_NAME	"igen6_ibecc"
@@ -75,7 +75,7 @@
 #define IBECC_ACTIVATE_EN		BIT(0)
 
 /* IBECC error log */
-#define ECC_ERROR_LOG_OFFSET		(IBECC_BASE + 0x170)
+#define ECC_ERROR_LOG_OFFSET		(IBECC_BASE + res_cfg->ibecc_error_log_offset)
 #define ECC_ERROR_LOG_CE		BIT_ULL(62)
 #define ECC_ERROR_LOG_UE		BIT_ULL(63)
 #define ECC_ERROR_LOG_ADDR_SHIFT	5
@@ -89,27 +89,32 @@
 #define MCHBAR_SIZE			0x10000
 
 /* Parameters for the channel decode stage */
-#define MAD_INTER_CHANNEL_OFFSET	0x5000
+#define IMC_BASE			(res_cfg->imc_base)
+#define MAD_INTER_CHANNEL_OFFSET	IMC_BASE
 #define MAD_INTER_CHANNEL_DDR_TYPE(v)	GET_BITFIELD(v, 0, 2)
 #define MAD_INTER_CHANNEL_ECHM(v)	GET_BITFIELD(v, 3, 3)
 #define MAD_INTER_CHANNEL_CH_L_MAP(v)	GET_BITFIELD(v, 4, 4)
 #define MAD_INTER_CHANNEL_CH_S_SIZE(v)	((u64)GET_BITFIELD(v, 12, 19) << 29)
 
 /* Parameters for DRAM decode stage */
-#define MAD_INTRA_CH0_OFFSET		0x5004
+#define MAD_INTRA_CH0_OFFSET		(IMC_BASE + 4)
 #define MAD_INTRA_CH_DIMM_L_MAP(v)	GET_BITFIELD(v, 0, 0)
 
 /* DIMM characteristics */
-#define MAD_DIMM_CH0_OFFSET		0x500c
+#define MAD_DIMM_CH0_OFFSET		(IMC_BASE + 0xc)
 #define MAD_DIMM_CH_DIMM_L_SIZE(v)	((u64)GET_BITFIELD(v, 0, 6) << 29)
 #define MAD_DIMM_CH_DLW(v)		GET_BITFIELD(v, 7, 8)
 #define MAD_DIMM_CH_DIMM_S_SIZE(v)	((u64)GET_BITFIELD(v, 16, 22) << 29)
 #define MAD_DIMM_CH_DSW(v)		GET_BITFIELD(v, 24, 25)
 
+/* Hash for memory controller selection */
+#define MAD_MC_HASH_OFFSET		(IMC_BASE + 0x1b8)
+#define MAC_MC_HASH_LSB(v)		GET_BITFIELD(v, 1, 3)
+
 /* Hash for channel selection */
-#define CHANNEL_HASH_OFFSET		0X5024
+#define CHANNEL_HASH_OFFSET		(IMC_BASE + 0x24)
 /* Hash for enhanced channel selection */
-#define CHANNEL_EHASH_OFFSET		0X5028
+#define CHANNEL_EHASH_OFFSET		(IMC_BASE + 0x28)
 #define CHANNEL_HASH_MASK(v)		(GET_BITFIELD(v, 6, 19) << 6)
 #define CHANNEL_HASH_LSB_MASK_BIT(v)	GET_BITFIELD(v, 24, 26)
 #define CHANNEL_HASH_MODE(v)		GET_BITFIELD(v, 28, 28)
@@ -121,15 +126,17 @@
 static struct res_config {
 	bool machine_check;
 	int num_imc;
+	u32 imc_base;
 	u32 cmf_base;
 	u32 cmf_size;
 	u32 ms_hash_offset;
 	u32 ibecc_base;
+	u32 ibecc_error_log_offset;
 	bool (*ibecc_available)(struct pci_dev *pdev);
 	/* Convert error address logged in IBECC to system physical address */
 	u64 (*err_addr_to_sys_addr)(u64 eaddr, int mc);
 	/* Convert error address logged in IBECC to integrated memory controller address */
-	u64 (*err_addr_to_imc_addr)(u64 eaddr);
+	u64 (*err_addr_to_imc_addr)(u64 eaddr, int mc);
 } *res_cfg;
 
 struct igen6_imc {
@@ -209,6 +216,12 @@ static struct work_struct ecclog_work;
 /* Compute die IDs for Tiger Lake with IBECC */
 #define DID_TGL_SKU	0x9a14
 
+/* Compute die IDs for Alder Lake with IBECC */
+#define DID_ADL_SKU1	0x4601
+#define DID_ADL_SKU2	0x4602
+#define DID_ADL_SKU3	0x4621
+#define DID_ADL_SKU4	0x4641
+
 static bool ehl_ibecc_available(struct pci_dev *pdev)
 {
 	u32 v;
@@ -224,7 +237,7 @@ static u64 ehl_err_addr_to_sys_addr(u64 eaddr, int mc)
 	return eaddr;
 }
 
-static u64 ehl_err_addr_to_imc_addr(u64 eaddr)
+static u64 ehl_err_addr_to_imc_addr(u64 eaddr, int mc)
 {
 	if (eaddr < igen6_tolud)
 		return eaddr;
@@ -315,22 +328,51 @@ static u64 tgl_err_addr_to_sys_addr(u64 eaddr, int mc)
 	return mem_addr_to_sys_addr(maddr);
 }
 
-static u64 tgl_err_addr_to_imc_addr(u64 eaddr)
+static u64 tgl_err_addr_to_imc_addr(u64 eaddr, int mc)
 {
 	return eaddr;
 }
 
+static u64 adl_err_addr_to_sys_addr(u64 eaddr, int mc)
+{
+	return mem_addr_to_sys_addr(eaddr);
+}
+
+static u64 adl_err_addr_to_imc_addr(u64 eaddr, int mc)
+{
+	u64 imc_addr, ms_s_size = igen6_pvt->ms_s_size;
+	struct igen6_imc *imc = &igen6_pvt->imc[mc];
+	int intlv_bit;
+	u32 mc_hash;
+
+	if (eaddr >= 2 * ms_s_size)
+		return eaddr - ms_s_size;
+
+	mc_hash = readl(imc->window + MAD_MC_HASH_OFFSET);
+
+	intlv_bit = MAC_MC_HASH_LSB(mc_hash) + 6;
+
+	imc_addr = GET_BITFIELD(eaddr, intlv_bit + 1, 63) << intlv_bit |
+		   GET_BITFIELD(eaddr, 0, intlv_bit - 1);
+
+	return imc_addr;
+}
+
 static struct res_config ehl_cfg = {
 	.num_imc		= 1,
+	.imc_base		= 0x5000,
 	.ibecc_base		= 0xdc00,
 	.ibecc_available	= ehl_ibecc_available,
+	.ibecc_error_log_offset	= 0x170,
 	.err_addr_to_sys_addr	= ehl_err_addr_to_sys_addr,
 	.err_addr_to_imc_addr	= ehl_err_addr_to_imc_addr,
 };
 
 static struct res_config icl_cfg = {
 	.num_imc		= 1,
+	.imc_base		= 0x5000,
 	.ibecc_base		= 0xd800,
+	.ibecc_error_log_offset	= 0x170,
 	.ibecc_available	= icl_ibecc_available,
 	.err_addr_to_sys_addr	= ehl_err_addr_to_sys_addr,
 	.err_addr_to_imc_addr	= ehl_err_addr_to_imc_addr,
@@ -339,15 +381,28 @@ static struct res_config icl_cfg = {
 static struct res_config tgl_cfg = {
 	.machine_check		= true,
 	.num_imc		= 2,
+	.imc_base		= 0x5000,
 	.cmf_base		= 0x11000,
 	.cmf_size		= 0x800,
 	.ms_hash_offset		= 0xac,
 	.ibecc_base		= 0xd400,
+	.ibecc_error_log_offset	= 0x170,
 	.ibecc_available	= tgl_ibecc_available,
 	.err_addr_to_sys_addr	= tgl_err_addr_to_sys_addr,
 	.err_addr_to_imc_addr	= tgl_err_addr_to_imc_addr,
 };
 
+static struct res_config adl_cfg = {
+	.machine_check		= true,
+	.num_imc		= 2,
+	.imc_base		= 0xd800,
+	.ibecc_base		= 0xd400,
+	.ibecc_error_log_offset	= 0x68,
+	.ibecc_available	= tgl_ibecc_available,
+	.err_addr_to_sys_addr	= adl_err_addr_to_sys_addr,
+	.err_addr_to_imc_addr	= adl_err_addr_to_imc_addr,
+};
+
 static const struct pci_device_id igen6_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU5), (kernel_ulong_t)&ehl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_EHL_SKU6), (kernel_ulong_t)&ehl_cfg },
@@ -365,6 +420,10 @@ static const struct pci_device_id igen6_pci_tbl[] = {
 	{ PCI_VDEVICE(INTEL, DID_ICL_SKU11), (kernel_ulong_t)&icl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_ICL_SKU12), (kernel_ulong_t)&icl_cfg },
 	{ PCI_VDEVICE(INTEL, DID_TGL_SKU), (kernel_ulong_t)&tgl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ADL_SKU1), (kernel_ulong_t)&adl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ADL_SKU2), (kernel_ulong_t)&adl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ADL_SKU3), (kernel_ulong_t)&adl_cfg },
+	{ PCI_VDEVICE(INTEL, DID_ADL_SKU4), (kernel_ulong_t)&adl_cfg },
 	{ },
 };
 MODULE_DEVICE_TABLE(pci, igen6_pci_tbl);
@@ -624,7 +683,7 @@ static void ecclog_work_cb(struct work_struct *work)
 			ECC_ERROR_LOG_ADDR_SHIFT;
 		res.mc	     = node->mc;
 		res.sys_addr = res_cfg->err_addr_to_sys_addr(eaddr, res.mc);
-		res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr);
+		res.imc_addr = res_cfg->err_addr_to_imc_addr(eaddr, res.mc);
 
 		mci = igen6_pvt->imc[res.mc].mci;
 
@@ -1071,6 +1130,9 @@ static int igen6_mem_slice_setup(u64 mchbar)
 	edac_dbg(0, "ms_s_size: %llu MiB, ms_l_map %d\n",
 		 ms_s_size >> 20, ms_l_map);
 
+	if (!size)
+		return 0;
+
 	cmf = ioremap(base, size);
 	if (!cmf) {
 		igen6_printk(KERN_ERR, "Failed to ioremap cmf 0x%llx\n", base);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2021-06-11 17:01 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-11 17:01 [PATCH 0/6] Bundle of Intel EDAC patches Tony Luck
2021-06-11 17:01 ` [PATCH 1/6] EDAC/skx_common: Add new ADXL components for 2-level memory Tony Luck
2021-06-11 17:01 ` [PATCH 2/6] EDAC/i10nm: Add detection of memory levels for ICX/SPR servers Tony Luck
2021-06-11 17:01 ` [PATCH 3/6] EDAC/i10nm: Add support for high bandwidth memory Tony Luck
2021-06-11 17:01 ` [PATCH 4/6] EDAC/igen6: Add Intel ICL-NNPI SoC support Tony Luck
2021-06-11 17:01 ` [PATCH 5/6] EDAC/igen6: Add Intel Tiger Lake " Tony Luck
2021-06-11 17:01 ` [PATCH 6/6] EDAC/igen6: Add Intel Alder " Tony Luck

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).