linux-edac.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tony Luck <tony.luck@intel.com>
To: linux-edac@vger.kernel.org
Cc: Youquan Song <youquan.song@intel.com>,
	Qiuxu Zhuo <qiuxu.zhuo@intel.com>,
	Tony Luck <tony.luck@intel.com>,
	Aristeu Rozanski <aris@redhat.com>,
	Borislav Petkov <bp@alien8.de>,
	Mauro Carvalho Chehab <mchehab@kernel.org>,
	linux-kernel@vger.kernel.org
Subject: [PATCH 2/2] EDAC/i10nm: Retrieve and print retry_rd_err_log registers
Date: Wed, 18 Aug 2021 10:57:01 -0700	[thread overview]
Message-ID: <20210818175701.1611513-3-tony.luck@intel.com> (raw)
In-Reply-To: <20210818175701.1611513-1-tony.luck@intel.com>

From: Youquan Song <youquan.song@intel.com>

Retrieve and print retry_rd_err_log registers like the earlier change:
commit e80634a75aba ("EDAC, skx: Retrieve and print retry_rd_err_log registers")

This is a little trickier than on Skylake because of potential
interference with BIOS use of the same registers. The default
behavior is to ignore these registers.

A module parameter retry_rd_err_log(default=0) controls the mode of operation:
- 0=off  : Default.
- 1=bios : Linux doesn't reset any control bits, but just reports values.
           This is "no harm" mode, but it may miss reporting some data.
- 2=linux: Linux tries to take control and resets mode bits,
           clears valid/UC bits after reading. This should be
           more reliable (especially if BIOS interference is reduced
           by disabling eMCA reporting mode in BIOS setup).

Co-developed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Signed-off-by: Youquan Song <youquan.song@intel.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 drivers/edac/i10nm_base.c | 146 ++++++++++++++++++++++++++++++++++++++
 drivers/edac/skx_base.c   |   3 +-
 drivers/edac/skx_common.c |   4 +-
 drivers/edac/skx_common.h |   7 +-
 4 files changed, 157 insertions(+), 3 deletions(-)

diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c
index b4a024cb8b97..83345bfac246 100644
--- a/drivers/edac/i10nm_base.c
+++ b/drivers/edac/i10nm_base.c
@@ -42,6 +42,12 @@
 #define I10NM_GET_AMAP(m, i)		\
 	readl((m)->mbase + ((m)->hbm_mc ? 0x814 : 0x20814) + \
 	(i) * (m)->chan_mmio_sz)
+#define I10NM_GET_REG32(m, i, offset)	\
+	readl((m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
+#define I10NM_GET_REG64(m, i, offset)	\
+	readq((m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
+#define I10NM_SET_REG32(m, i, offset, v)	\
+	writel(v, (m)->mbase + (i) * (m)->chan_mmio_sz + (offset))
 
 #define I10NM_GET_SCK_MMIO_BASE(reg)	(GET_BITFIELD(reg, 0, 28) << 23)
 #define I10NM_GET_IMC_MMIO_OFFSET(reg)	(GET_BITFIELD(reg, 0, 10) << 12)
@@ -58,8 +64,125 @@
 #define I10NM_SAD_ENABLE(reg)		GET_BITFIELD(reg, 0, 0)
 #define I10NM_SAD_NM_CACHEABLE(reg)	GET_BITFIELD(reg, 5, 5)
 
+#define RETRY_RD_ERR_LOG_UC		BIT(1)
+#define RETRY_RD_ERR_LOG_NOOVER		BIT(14)
+#define RETRY_RD_ERR_LOG_EN		BIT(15)
+#define RETRY_RD_ERR_LOG_NOOVER_UC	(BIT(14) | BIT(1))
+#define RETRY_RD_ERR_LOG_OVER_UC_V	(BIT(2) | BIT(1) | BIT(0))
+
 static struct list_head *i10nm_edac_list;
 
+static struct res_config *res_cfg;
+static int retry_rd_err_log;
+
+static u32 offsets_scrub_icx[]  = {0x22c60, 0x22c54, 0x22c5c, 0x22c58, 0x22c28, 0x20ed8};
+static u32 offsets_scrub_spr[]  = {0x22c60, 0x22c54, 0x22f08, 0x22c58, 0x22c28, 0x20ed8};
+static u32 offsets_demand_icx[] = {0x22e54, 0x22e60, 0x22e64, 0x22e58, 0x22e5c, 0x20ee0};
+static u32 offsets_demand_spr[] = {0x22e54, 0x22e60, 0x22f10, 0x22e58, 0x22e5c, 0x20ee0};
+
+static void __enable_retry_rd_err_log(struct skx_imc *imc, int chan, bool enable)
+{
+	u32 s, d;
+
+	if (!imc->mbase)
+		return;
+
+	s = I10NM_GET_REG32(imc, chan, res_cfg->offsets_scrub[0]);
+	d = I10NM_GET_REG32(imc, chan, res_cfg->offsets_demand[0]);
+
+	if (enable) {
+		/* Save default configurations */
+		imc->chan[chan].retry_rd_err_log_s = s;
+		imc->chan[chan].retry_rd_err_log_d = d;
+
+		s &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
+		s |=  RETRY_RD_ERR_LOG_EN;
+		d &= ~RETRY_RD_ERR_LOG_NOOVER_UC;
+		d |=  RETRY_RD_ERR_LOG_EN;
+	} else {
+		/* Restore default configurations */
+		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_UC)
+			s |=  RETRY_RD_ERR_LOG_UC;
+		if (imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_NOOVER)
+			s |=  RETRY_RD_ERR_LOG_NOOVER;
+		if (!(imc->chan[chan].retry_rd_err_log_s & RETRY_RD_ERR_LOG_EN))
+			s &= ~RETRY_RD_ERR_LOG_EN;
+		if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_UC)
+			d |=  RETRY_RD_ERR_LOG_UC;
+		if (imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_NOOVER)
+			d |=  RETRY_RD_ERR_LOG_NOOVER;
+		if (!(imc->chan[chan].retry_rd_err_log_d & RETRY_RD_ERR_LOG_EN))
+			d &= ~RETRY_RD_ERR_LOG_EN;
+	}
+
+	I10NM_SET_REG32(imc, chan, res_cfg->offsets_scrub[0], s);
+	I10NM_SET_REG32(imc, chan, res_cfg->offsets_demand[0], d);
+}
+
+static void enable_retry_rd_err_log(bool enable)
+{
+	struct skx_dev *d;
+	int i, j;
+
+	edac_dbg(2, "\n");
+
+	list_for_each_entry(d, i10nm_edac_list, list)
+		for (i = 0; i < I10NM_NUM_IMC; i++)
+			for (j = 0; j < I10NM_NUM_CHANNELS; j++)
+				__enable_retry_rd_err_log(&d->imc[i], j, enable);
+}
+
+static void show_retry_rd_err_log(struct decoded_addr *res, char *msg,
+				  int len, bool scrub_err)
+{
+	struct skx_imc *imc = &res->dev->imc[res->imc];
+	u32 log0, log1, log2, log3, log4;
+	u32 corr0, corr1, corr2, corr3;
+	u64 log2a, log5;
+	u32 *offsets;
+	int n;
+
+	if (!imc->mbase)
+		return;
+
+	offsets = scrub_err ? res_cfg->offsets_scrub : res_cfg->offsets_demand;
+
+	log0 = I10NM_GET_REG32(imc, res->channel, offsets[0]);
+	log1 = I10NM_GET_REG32(imc, res->channel, offsets[1]);
+	log3 = I10NM_GET_REG32(imc, res->channel, offsets[3]);
+	log4 = I10NM_GET_REG32(imc, res->channel, offsets[4]);
+	log5 = I10NM_GET_REG64(imc, res->channel, offsets[5]);
+
+	if (res_cfg->type == SPR) {
+		log2a = I10NM_GET_REG64(imc, res->channel, offsets[2]);
+		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.16llx %.8x %.8x %.16llx]",
+			     log0, log1, log2a, log3, log4, log5);
+	} else {
+		log2 = I10NM_GET_REG32(imc, res->channel, offsets[2]);
+		n = snprintf(msg, len, " retry_rd_err_log[%.8x %.8x %.8x %.8x %.8x %.16llx]",
+			     log0, log1, log2, log3, log4, log5);
+	}
+
+	corr0 = I10NM_GET_REG32(imc, res->channel, 0x22c18);
+	corr1 = I10NM_GET_REG32(imc, res->channel, 0x22c1c);
+	corr2 = I10NM_GET_REG32(imc, res->channel, 0x22c20);
+	corr3 = I10NM_GET_REG32(imc, res->channel, 0x22c24);
+
+	if (len - n > 0)
+		snprintf(msg + n, len - n,
+			 " correrrcnt[%.4x %.4x %.4x %.4x %.4x %.4x %.4x %.4x]",
+			 corr0 & 0xffff, corr0 >> 16,
+			 corr1 & 0xffff, corr1 >> 16,
+			 corr2 & 0xffff, corr2 >> 16,
+			 corr3 & 0xffff, corr3 >> 16);
+
+	/* Clear status bits */
+	if (retry_rd_err_log == 2 && (log0 & RETRY_RD_ERR_LOG_OVER_UC_V)) {
+		log0 &= ~RETRY_RD_ERR_LOG_OVER_UC_V;
+		I10NM_SET_REG32(imc, res->channel, offsets[0], log0);
+	}
+}
+
 static struct pci_dev *pci_get_dev_wrapper(int dom, unsigned int bus,
 					   unsigned int dev, unsigned int fun)
 {
@@ -263,6 +386,8 @@ static struct res_config i10nm_cfg0 = {
 	.ddr_chan_mmio_sz	= 0x4000,
 	.sad_all_devfn		= PCI_DEVFN(29, 0),
 	.sad_all_offset		= 0x108,
+	.offsets_scrub		= offsets_scrub_icx,
+	.offsets_demand		= offsets_demand_icx,
 };
 
 static struct res_config i10nm_cfg1 = {
@@ -272,6 +397,8 @@ static struct res_config i10nm_cfg1 = {
 	.ddr_chan_mmio_sz	= 0x4000,
 	.sad_all_devfn		= PCI_DEVFN(29, 0),
 	.sad_all_offset		= 0x108,
+	.offsets_scrub		= offsets_scrub_icx,
+	.offsets_demand		= offsets_demand_icx,
 };
 
 static struct res_config spr_cfg = {
@@ -283,6 +410,8 @@ static struct res_config spr_cfg = {
 	.support_ddr5		= true,
 	.sad_all_devfn		= PCI_DEVFN(10, 0),
 	.sad_all_offset		= 0x300,
+	.offsets_scrub		= offsets_scrub_spr,
+	.offsets_demand		= offsets_demand_spr,
 };
 
 static const struct x86_cpu_id i10nm_cpuids[] = {
@@ -422,6 +551,7 @@ static int __init i10nm_init(void)
 		return -ENODEV;
 
 	cfg = (struct res_config *)id->driver_data;
+	res_cfg = cfg;
 
 	rc = skx_get_hi_lo(0x09a2, off, &tolm, &tohm);
 	if (rc)
@@ -486,6 +616,12 @@ static int __init i10nm_init(void)
 	mce_register_decode_chain(&i10nm_mce_dec);
 	setup_i10nm_debug();
 
+	if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
+		skx_set_decode(NULL, show_retry_rd_err_log);
+		if (retry_rd_err_log == 2)
+			enable_retry_rd_err_log(true);
+	}
+
 	i10nm_printk(KERN_INFO, "%s\n", I10NM_REVISION);
 
 	return 0;
@@ -497,6 +633,13 @@ static int __init i10nm_init(void)
 static void __exit i10nm_exit(void)
 {
 	edac_dbg(2, "\n");
+
+	if (retry_rd_err_log && res_cfg->offsets_scrub && res_cfg->offsets_demand) {
+		skx_set_decode(NULL, NULL);
+		if (retry_rd_err_log == 2)
+			enable_retry_rd_err_log(false);
+	}
+
 	teardown_i10nm_debug();
 	mce_unregister_decode_chain(&i10nm_mce_dec);
 	skx_adxl_put();
@@ -506,5 +649,8 @@ static void __exit i10nm_exit(void)
 module_init(i10nm_init);
 module_exit(i10nm_exit);
 
+module_param(retry_rd_err_log, int, 0444);
+MODULE_PARM_DESC(retry_rd_err_log, "retry_rd_err_log: 0=off(default), 1=bios(Linux doesn't reset any control bits, but just reports values.), 2=linux(Linux tries to take control and resets mode bits, clear valid/UC bits after reading.)");
+
 MODULE_LICENSE("GPL v2");
 MODULE_DESCRIPTION("MC Driver for Intel 10nm server processors");
diff --git a/drivers/edac/skx_base.c b/drivers/edac/skx_base.c
index 4dbd46575bfb..1abc020d49ab 100644
--- a/drivers/edac/skx_base.c
+++ b/drivers/edac/skx_base.c
@@ -230,7 +230,8 @@ static int skx_get_dimm_config(struct mem_ctl_info *mci, struct res_config *cfg)
 #define SKX_ILV_TARGET(tgt)	((tgt) & 7)
 
 static void skx_show_retry_rd_err_log(struct decoded_addr *res,
-				      char *msg, int len)
+				      char *msg, int len,
+				      bool scrub_err)
 {
 	u32 log0, log1, log2, log3, log4;
 	u32 corr0, corr1, corr2, corr3;
diff --git a/drivers/edac/skx_common.c b/drivers/edac/skx_common.c
index 5e83f59bef8a..0c133d32b777 100644
--- a/drivers/edac/skx_common.c
+++ b/drivers/edac/skx_common.c
@@ -529,6 +529,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 	bool ripv = GET_BITFIELD(m->mcgstatus, 0, 0);
 	bool overflow = GET_BITFIELD(m->status, 62, 62);
 	bool uncorrected_error = GET_BITFIELD(m->status, 61, 61);
+	bool scrub_err = false;
 	bool recoverable;
 	int len;
 	u32 core_err_cnt = GET_BITFIELD(m->status, 38, 52);
@@ -580,6 +581,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 			break;
 		case 4:
 			optype = "memory scrubbing error";
+			scrub_err = true;
 			break;
 		default:
 			optype = "reserved";
@@ -602,7 +604,7 @@ static void skx_mce_output_error(struct mem_ctl_info *mci,
 	}
 
 	if (skx_show_retry_rd_err_log)
-		skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len);
+		skx_show_retry_rd_err_log(res, skx_msg + len, MSG_SIZE - len, scrub_err);
 
 	edac_dbg(0, "%s\n", skx_msg);
 
diff --git a/drivers/edac/skx_common.h b/drivers/edac/skx_common.h
index 01f67e731766..03ac067a80b9 100644
--- a/drivers/edac/skx_common.h
+++ b/drivers/edac/skx_common.h
@@ -80,6 +80,8 @@ struct skx_dev {
 		struct skx_channel {
 			struct pci_dev	*cdev;
 			struct pci_dev	*edev;
+			u32 retry_rd_err_log_s;
+			u32 retry_rd_err_log_d;
 			struct skx_dimm {
 				u8 close_pg;
 				u8 bank_xor_enable;
@@ -150,12 +152,15 @@ struct res_config {
 	/* SAD device number and function number */
 	unsigned int sad_all_devfn;
 	int sad_all_offset;
+	/* Offsets of retry_rd_err_log registers */
+	u32 *offsets_scrub;
+	u32 *offsets_demand;
 };
 
 typedef int (*get_dimm_config_f)(struct mem_ctl_info *mci,
 				 struct res_config *cfg);
 typedef bool (*skx_decode_f)(struct decoded_addr *res);
-typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len);
+typedef void (*skx_show_retry_log_f)(struct decoded_addr *res, char *msg, int len, bool scrub_err);
 
 int __init skx_adxl_get(void);
 void __exit skx_adxl_put(void);
-- 
2.29.2


      parent reply	other threads:[~2021-08-18 17:57 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-08-18 17:56 [PATCH 0/2] EDAC/i10nm fix and enhancement Tony Luck
2021-08-18 17:57 ` [PATCH 1/2] EDAC/i10nm: Fix NVDIMM detection Tony Luck
2021-08-18 17:57 ` Tony Luck [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210818175701.1611513-3-tony.luck@intel.com \
    --to=tony.luck@intel.com \
    --cc=aris@redhat.com \
    --cc=bp@alien8.de \
    --cc=linux-edac@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mchehab@kernel.org \
    --cc=qiuxu.zhuo@intel.com \
    --cc=youquan.song@intel.com \
    --subject='Re: [PATCH 2/2] EDAC/i10nm: Retrieve and print retry_rd_err_log registers' \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).