Linux-EDAC Archive on lore.kernel.org
 help / color / Atom feed
* [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
@ 2019-07-15 10:53 Hanna Hawa
  2019-07-17 12:06 ` Jan Glauber
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Hanna Hawa @ 2019-07-15 10:53 UTC (permalink / raw)
  To: thor.thayer, bp, mchehab, james.morse, rric, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong
  Cc: dwmw, benh, ronenk, talel, jonnyc, hanochu, linux-edac,
	linux-kernel, linux-mips, linux-arm-msm, hhhawa

Add a counter parameter in order to avoid losing errors count for edac
device, the error count reports the number of errors reported by an edac
device similar to the way MC_EDAC do.

Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
---
 drivers/edac/altera_edac.c      | 20 ++++++++++++--------
 drivers/edac/amd8111_edac.c     |  6 +++---
 drivers/edac/cpc925_edac.c      |  4 ++--
 drivers/edac/edac_device.c      | 18 ++++++++++--------
 drivers/edac/edac_device.h      |  8 ++++++--
 drivers/edac/highbank_l2_edac.c |  4 ++--
 drivers/edac/mpc85xx_edac.c     |  4 ++--
 drivers/edac/mv64x60_edac.c     |  4 ++--
 drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
 drivers/edac/octeon_edac-pc.c   |  6 +++---
 drivers/edac/qcom_edac.c        |  8 ++++----
 drivers/edac/thunderx_edac.c    | 10 +++++-----
 drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
 13 files changed, 74 insertions(+), 64 deletions(-)

diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
index 8816f74..747dd43 100644
--- a/drivers/edac/altera_edac.c
+++ b/drivers/edac/altera_edac.c
@@ -616,12 +616,12 @@ static irqreturn_t altr_edac_device_handler(int irq, void *dev_id)
 	if (irq == drvdata->sb_irq) {
 		if (priv->ce_clear_mask)
 			writel(priv->ce_clear_mask, drvdata->base);
-		edac_device_handle_ce(dci, 0, 0, drvdata->edac_dev_name);
+		edac_device_handle_ce(dci, 1, 0, 0, drvdata->edac_dev_name);
 		ret_value = IRQ_HANDLED;
 	} else if (irq == drvdata->db_irq) {
 		if (priv->ue_clear_mask)
 			writel(priv->ue_clear_mask, drvdata->base);
-		edac_device_handle_ue(dci, 0, 0, drvdata->edac_dev_name);
+		edac_device_handle_ue(dci, 1, 0, 0, drvdata->edac_dev_name);
 		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
 		ret_value = IRQ_HANDLED;
 	} else {
@@ -919,13 +919,15 @@ static irqreturn_t __maybe_unused altr_edac_a10_ecc_irq(int irq, void *dev_id)
 	if (irq == dci->sb_irq) {
 		writel(ALTR_A10_ECC_SERRPENA,
 		       base + ALTR_A10_ECC_INTSTAT_OFST);
-		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
+		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
+				      dci->edac_dev_name);
 
 		return IRQ_HANDLED;
 	} else if (irq == dci->db_irq) {
 		writel(ALTR_A10_ECC_DERRPENA,
 		       base + ALTR_A10_ECC_INTSTAT_OFST);
-		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
+		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
+				      dci->edac_dev_name);
 		if (dci->data->panic)
 			panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
 
@@ -1308,14 +1310,16 @@ static irqreturn_t altr_edac_a10_l2_irq(int irq, void *dev_id)
 		regmap_write(dci->edac->ecc_mgr_map,
 			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
 			     A10_SYSGMR_MPU_CLEAR_L2_ECC_SB);
-		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
+		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
+				      dci->edac_dev_name);
 
 		return IRQ_HANDLED;
 	} else if (irq == dci->db_irq) {
 		regmap_write(dci->edac->ecc_mgr_map,
 			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
 			     A10_SYSGMR_MPU_CLEAR_L2_ECC_MB);
-		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
+		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
+				      dci->edac_dev_name);
 		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
 
 		return IRQ_HANDLED;
@@ -1652,12 +1656,12 @@ static irqreturn_t altr_edac_a10_ecc_irq_portb(int irq, void *dev_id)
 	if (irq == ad->sb_irq) {
 		writel(priv->ce_clear_mask,
 		       base + ALTR_A10_ECC_INTSTAT_OFST);
-		edac_device_handle_ce(ad->edac_dev, 0, 0, ad->edac_dev_name);
+		edac_device_handle_ce(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
 		return IRQ_HANDLED;
 	} else if (irq == ad->db_irq) {
 		writel(priv->ue_clear_mask,
 		       base + ALTR_A10_ECC_INTSTAT_OFST);
-		edac_device_handle_ue(ad->edac_dev, 0, 0, ad->edac_dev_name);
+		edac_device_handle_ue(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
 		return IRQ_HANDLED;
 	}
 
diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
index b5786cf..e595fab 100644
--- a/drivers/edac/amd8111_edac.c
+++ b/drivers/edac/amd8111_edac.c
@@ -303,7 +303,7 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
 		val8 |= IO_CTRL_1_CLEAR_MASK;
 		edac_pci_write_byte(dev, REG_IO_CTRL_1, val8);
 
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 	}
 
 	if (at_compat_reg_broken == 0) {
@@ -315,8 +315,8 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
 			out8 |= AT_COMPAT_CLRIOCHK;
 		if (out8 > 0) {
 			__do_outb(out8, REG_AT_COMPAT);
-			edac_device_handle_ue(edac_dev, 0, 0,
-						edac_dev->ctl_name);
+			edac_device_handle_ue(edac_dev, 1, 0, 0,
+					      edac_dev->ctl_name);
 		}
 	}
 }
diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
index 3c0881a..eb74865 100644
--- a/drivers/edac/cpc925_edac.c
+++ b/drivers/edac/cpc925_edac.c
@@ -682,7 +682,7 @@ static void cpc925_cpu_check(struct edac_device_ctl_info *edac_dev)
 	cpc925_printk(KERN_INFO, "APIMASK		0x%08x\n", apimask);
 	cpc925_printk(KERN_INFO, "APIEXCP		0x%08x\n", apiexcp);
 
-	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 /******************** HT Link err device****************************/
@@ -756,7 +756,7 @@ static void cpc925_htlink_check(struct edac_device_ctl_info *edac_dev)
 		__raw_writel(HT_LINKERR_DETECTED,
 				dev_info->vbase + REG_LINKERR_OFFSET);
 
-	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static struct cpc925_dev_info cpc925_devs[] = {
diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
index 65cf2b9..d1de296 100644
--- a/drivers/edac/edac_device.c
+++ b/drivers/edac/edac_device.c
@@ -556,7 +556,8 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
 }
 
 void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
-			int inst_nr, int block_nr, const char *msg)
+			   u16 error_count, int inst_nr, int block_nr,
+			   const char *msg)
 {
 	struct edac_device_instance *instance;
 	struct edac_device_block *block = NULL;
@@ -582,12 +583,12 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
 
 	if (instance->nr_blocks > 0) {
 		block = instance->blocks + block_nr;
-		block->counters.ce_count++;
+		block->counters.ce_count += error_count;
 	}
 
 	/* Propagate the count up the 'totals' tree */
-	instance->counters.ce_count++;
-	edac_dev->counters.ce_count++;
+	instance->counters.ce_count += error_count;
+	edac_dev->counters.ce_count += error_count;
 
 	if (edac_device_get_log_ce(edac_dev))
 		edac_device_printk(edac_dev, KERN_WARNING,
@@ -598,7 +599,8 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
 EXPORT_SYMBOL_GPL(edac_device_handle_ce);
 
 void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
-			int inst_nr, int block_nr, const char *msg)
+			   u16 error_count, int inst_nr, int block_nr,
+			   const char *msg)
 {
 	struct edac_device_instance *instance;
 	struct edac_device_block *block = NULL;
@@ -624,12 +626,12 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
 
 	if (instance->nr_blocks > 0) {
 		block = instance->blocks + block_nr;
-		block->counters.ue_count++;
+		block->counters.ue_count += error_count;
 	}
 
 	/* Propagate the count up the 'totals' tree */
-	instance->counters.ue_count++;
-	edac_dev->counters.ue_count++;
+	instance->counters.ue_count += error_count;
+	edac_dev->counters.ue_count += error_count;
 
 	if (edac_device_get_log_ue(edac_dev))
 		edac_device_printk(edac_dev, KERN_EMERG,
diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
index 1aaba74..cf1a1da 100644
--- a/drivers/edac/edac_device.h
+++ b/drivers/edac/edac_device.h
@@ -290,23 +290,27 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
  *	perform a common output and handling of an 'edac_dev' UE event
  *
  * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @error_count: number of errors of the same type
  * @inst_nr: number of the instance where the UE error happened
  * @block_nr: number of the block where the UE error happened
  * @msg: message to be printed
  */
 extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
-				int inst_nr, int block_nr, const char *msg);
+				  u16 error_count, int inst_nr, int block_nr,
+				  const char *msg);
 /**
  * edac_device_handle_ce():
  *	perform a common output and handling of an 'edac_dev' CE event
  *
  * @edac_dev: pointer to struct &edac_device_ctl_info
+ * @error_count: number of errors of the same type
  * @inst_nr: number of the instance where the CE error happened
  * @block_nr: number of the block where the CE error happened
  * @msg: message to be printed
  */
 extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
-				int inst_nr, int block_nr, const char *msg);
+				  u16 error_count, int inst_nr, int block_nr,
+				  const char *msg);
 
 /**
  * edac_device_alloc_index: Allocate a unique device index number
diff --git a/drivers/edac/highbank_l2_edac.c b/drivers/edac/highbank_l2_edac.c
index cd9a2bb..65f016a 100644
--- a/drivers/edac/highbank_l2_edac.c
+++ b/drivers/edac/highbank_l2_edac.c
@@ -39,11 +39,11 @@ static irqreturn_t highbank_l2_err_handler(int irq, void *dev_id)
 
 	if (irq == drvdata->sb_irq) {
 		writel(1, drvdata->base + SR_CLR_SB_ECC_INTR);
-		edac_device_handle_ce(dci, 0, 0, dci->ctl_name);
+		edac_device_handle_ce(dci, 1, 0, 0, dci->ctl_name);
 	}
 	if (irq == drvdata->db_irq) {
 		writel(1, drvdata->base + SR_CLR_DB_ECC_INTR);
-		edac_device_handle_ue(dci, 0, 0, dci->ctl_name);
+		edac_device_handle_ue(dci, 1, 0, 0, dci->ctl_name);
 	}
 
 	return IRQ_HANDLED;
diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
index 67f7bc3..0618a06 100644
--- a/drivers/edac/mpc85xx_edac.c
+++ b/drivers/edac/mpc85xx_edac.c
@@ -464,10 +464,10 @@ static void mpc85xx_l2_check(struct edac_device_ctl_info *edac_dev)
 	out_be32(pdata->l2_vbase + MPC85XX_L2_ERRDET, err_detect);
 
 	if (err_detect & L2_EDE_CE_MASK)
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 
 	if (err_detect & L2_EDE_UE_MASK)
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static irqreturn_t mpc85xx_l2_isr(int irq, void *dev_id)
diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
index 3c68bb5..005b012 100644
--- a/drivers/edac/mv64x60_edac.c
+++ b/drivers/edac/mv64x60_edac.c
@@ -251,7 +251,7 @@ static void mv64x60_sram_check(struct edac_device_ctl_info *edac_dev)
 	       readl(pdata->sram_vbase + MV64X60_SRAM_ERR_PARITY));
 	writel(0, pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
 
-	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static irqreturn_t mv64x60_sram_isr(int irq, void *dev_id)
@@ -417,7 +417,7 @@ static void mv64x60_cpu_check(struct edac_device_ctl_info *edac_dev)
 	       readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_PARITY));
 	writel(0, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE);
 
-	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static irqreturn_t mv64x60_cpu_isr(int irq, void *dev_id)
diff --git a/drivers/edac/octeon_edac-l2c.c b/drivers/edac/octeon_edac-l2c.c
index c33059e..8e58531 100644
--- a/drivers/edac/octeon_edac-l2c.c
+++ b/drivers/edac/octeon_edac-l2c.c
@@ -28,12 +28,12 @@ static void octeon_l2c_poll_oct1(struct edac_device_ctl_info *l2c)
 	l2t_err_reset.u64 = 0;
 	l2t_err.u64 = cvmx_read_csr(CVMX_L2T_ERR);
 	if (l2t_err.s.sec_err) {
-		edac_device_handle_ce(l2c, 0, 0,
+		edac_device_handle_ce(l2c, 1, 0, 0,
 				      "Tag Single bit error (corrected)");
 		l2t_err_reset.s.sec_err = 1;
 	}
 	if (l2t_err.s.ded_err) {
-		edac_device_handle_ue(l2c, 0, 0,
+		edac_device_handle_ue(l2c, 1, 0, 0,
 				      "Tag Double bit error (detected)");
 		l2t_err_reset.s.ded_err = 1;
 	}
@@ -43,12 +43,12 @@ static void octeon_l2c_poll_oct1(struct edac_device_ctl_info *l2c)
 	l2d_err_reset.u64 = 0;
 	l2d_err.u64 = cvmx_read_csr(CVMX_L2D_ERR);
 	if (l2d_err.s.sec_err) {
-		edac_device_handle_ce(l2c, 0, 1,
+		edac_device_handle_ce(l2c, 1, 0, 1,
 				      "Data Single bit error (corrected)");
 		l2d_err_reset.s.sec_err = 1;
 	}
 	if (l2d_err.s.ded_err) {
-		edac_device_handle_ue(l2c, 0, 1,
+		edac_device_handle_ue(l2c, 1, 0, 1,
 				      "Data Double bit error (detected)");
 		l2d_err_reset.s.ded_err = 1;
 	}
@@ -76,25 +76,25 @@ static void _octeon_l2c_poll_oct2(struct edac_device_ctl_info *l2c, int tad)
 		snprintf(buf2, sizeof(buf2),
 			 "L2D Double bit error (detected):%s", buf1);
 		err_tdtx_reset.s.dbe = 1;
-		edac_device_handle_ue(l2c, tad, 1, buf2);
+		edac_device_handle_ue(l2c, 1, tad, 1, buf2);
 	}
 	if (err_tdtx.s.sbe) {
 		snprintf(buf2, sizeof(buf2),
 			 "L2D Single bit error (corrected):%s", buf1);
 		err_tdtx_reset.s.sbe = 1;
-		edac_device_handle_ce(l2c, tad, 1, buf2);
+		edac_device_handle_ce(l2c, 1, tad, 1, buf2);
 	}
 	if (err_tdtx.s.vdbe) {
 		snprintf(buf2, sizeof(buf2),
 			 "VBF Double bit error (detected):%s", buf1);
 		err_tdtx_reset.s.vdbe = 1;
-		edac_device_handle_ue(l2c, tad, 1, buf2);
+		edac_device_handle_ue(l2c, 1, tad, 1, buf2);
 	}
 	if (err_tdtx.s.vsbe) {
 		snprintf(buf2, sizeof(buf2),
 			 "VBF Single bit error (corrected):%s", buf1);
 		err_tdtx_reset.s.vsbe = 1;
-		edac_device_handle_ce(l2c, tad, 1, buf2);
+		edac_device_handle_ce(l2c, 1, tad, 1, buf2);
 	}
 	if (err_tdtx_reset.u64)
 		cvmx_write_csr(CVMX_L2C_ERR_TDTX(tad), err_tdtx_reset.u64);
@@ -111,13 +111,13 @@ static void _octeon_l2c_poll_oct2(struct edac_device_ctl_info *l2c, int tad)
 		snprintf(buf2, sizeof(buf2),
 			 "Tag Double bit error (detected):%s", buf1);
 		err_ttgx_reset.s.dbe = 1;
-		edac_device_handle_ue(l2c, tad, 0, buf2);
+		edac_device_handle_ue(l2c, 1, tad, 0, buf2);
 	}
 	if (err_ttgx.s.sbe) {
 		snprintf(buf2, sizeof(buf2),
 			 "Tag Single bit error (corrected):%s", buf1);
 		err_ttgx_reset.s.sbe = 1;
-		edac_device_handle_ce(l2c, tad, 0, buf2);
+		edac_device_handle_ce(l2c, 1, tad, 0, buf2);
 	}
 	if (err_ttgx_reset.u64)
 		cvmx_write_csr(CVMX_L2C_ERR_TTGX(tad), err_ttgx_reset.u64);
diff --git a/drivers/edac/octeon_edac-pc.c b/drivers/edac/octeon_edac-pc.c
index 754eced..efd0bbc 100644
--- a/drivers/edac/octeon_edac-pc.c
+++ b/drivers/edac/octeon_edac-pc.c
@@ -59,7 +59,7 @@ static int  co_cache_error_event(struct notifier_block *this,
 				   (unsigned long long)icache_err, core, cpu,
 				   read_c0_errorepc());
 		write_octeon_c0_icacheerr(0);
-		edac_device_handle_ce(p->ed, cpu, 1, "icache");
+		edac_device_handle_ce(p->ed, 1, cpu, 1, "icache");
 	}
 	if (dcache_err & 1) {
 		edac_device_printk(p->ed, KERN_ERR,
@@ -67,9 +67,9 @@ static int  co_cache_error_event(struct notifier_block *this,
 				   (unsigned long long)dcache_err, core, cpu,
 				   read_c0_errorepc());
 		if (event)
-			edac_device_handle_ue(p->ed, cpu, 0, "dcache");
+			edac_device_handle_ue(p->ed, 1, cpu, 0, "dcache");
 		else
-			edac_device_handle_ce(p->ed, cpu, 0, "dcache");
+			edac_device_handle_ce(p->ed, 1, cpu, 0, "dcache");
 
 		/* Clear the error indication */
 		if (OCTEON_IS_OCTEON2())
diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c
index 97a27e4..8730eed 100644
--- a/drivers/edac/qcom_edac.c
+++ b/drivers/edac/qcom_edac.c
@@ -261,19 +261,19 @@ dump_syn_reg(struct edac_device_ctl_info *edev_ctl, int err_type, u32 bank)
 
 	switch (err_type) {
 	case LLCC_DRAM_CE:
-		edac_device_handle_ce(edev_ctl, 0, bank,
+		edac_device_handle_ce(edev_ctl, 1, 0, bank,
 				      "LLCC Data RAM correctable Error");
 		break;
 	case LLCC_DRAM_UE:
-		edac_device_handle_ue(edev_ctl, 0, bank,
+		edac_device_handle_ue(edev_ctl, 1, 0, bank,
 				      "LLCC Data RAM uncorrectable Error");
 		break;
 	case LLCC_TRAM_CE:
-		edac_device_handle_ce(edev_ctl, 0, bank,
+		edac_device_handle_ce(edev_ctl, 1, 0, bank,
 				      "LLCC Tag RAM correctable Error");
 		break;
 	case LLCC_TRAM_UE:
-		edac_device_handle_ue(edev_ctl, 0, bank,
+		edac_device_handle_ue(edev_ctl, 1, 0, bank,
 				      "LLCC Tag RAM uncorrectable Error");
 		break;
 	default:
diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c
index 34be60f..35a186f 100644
--- a/drivers/edac/thunderx_edac.c
+++ b/drivers/edac/thunderx_edac.c
@@ -1151,7 +1151,7 @@ static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id)
 			}
 
 		if (ctx->reg_com_int & OCX_COM_INT_CE)
-			edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+			edac_device_handle_ce(ocx->edac_dev, 1, 0, 0, msg);
 
 		ocx->com_ring_tail++;
 	}
@@ -1220,9 +1220,9 @@ static irqreturn_t thunderx_ocx_lnk_threaded_isr(int irq, void *irq_id)
 		strncat(msg, other, OCX_MESSAGE_SIZE);
 
 		if (ctx->reg_com_link_int & OCX_COM_LINK_INT_UE)
-			edac_device_handle_ue(ocx->edac_dev, 0, 0, msg);
+			edac_device_handle_ue(ocx->edac_dev, 1, 0, 0, msg);
 		else if (ctx->reg_com_link_int & OCX_COM_LINK_INT_CE)
-			edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
+			edac_device_handle_ce(ocx->edac_dev, 1, 0, 0, msg);
 
 		ocx->link_ring_tail++;
 	}
@@ -1899,9 +1899,9 @@ static irqreturn_t thunderx_l2c_threaded_isr(int irq, void *irq_id)
 		strncat(msg, other, L2C_MESSAGE_SIZE);
 
 		if (ctx->reg_int & mask_ue)
-			edac_device_handle_ue(l2c->edac_dev, 0, 0, msg);
+			edac_device_handle_ue(l2c->edac_dev, 1, 0, 0, msg);
 		else if (ctx->reg_int & mask_ce)
-			edac_device_handle_ce(l2c->edac_dev, 0, 0, msg);
+			edac_device_handle_ce(l2c->edac_dev, 1, 0, 0, msg);
 
 		l2c->ring_tail++;
 	}
diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c
index e8b81d7..d31a8bb 100644
--- a/drivers/edac/xgene_edac.c
+++ b/drivers/edac/xgene_edac.c
@@ -574,7 +574,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
 
 	if (val & (MEMERR_CPU_ICFESR_CERR_MASK |
 		   MEMERR_CPU_ICFESR_MULTCERR_MASK))
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 
 chk_lsu:
 	val = readl(pg_f + MEMERR_CPU_LSUESR_PAGE_OFFSET);
@@ -618,7 +618,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
 
 	if (val & (MEMERR_CPU_LSUESR_CERR_MASK |
 		   MEMERR_CPU_LSUESR_MULTCERR_MASK))
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 
 chk_mmu:
 	val = readl(pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET);
@@ -665,7 +665,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
 	/* Clear any HW errors */
 	writel(val, pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET);
 
-	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static void xgene_edac_pmd_l2_check(struct edac_device_ctl_info *edac_dev)
@@ -724,10 +724,10 @@ static void xgene_edac_pmd_l2_check(struct edac_device_ctl_info *edac_dev)
 
 	if (val & (MEMERR_L2C_L2ESR_ERR_MASK |
 		   MEMERR_L2C_L2ESR_MULTICERR_MASK))
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 	if (val & (MEMERR_L2C_L2ESR_UCERR_MASK |
 		   MEMERR_L2C_L2ESR_MULTUCERR_MASK))
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 
 chk_l2c:
 	/* Check if any memory request timed out on L2 cache */
@@ -1113,13 +1113,13 @@ static void xgene_edac_l3_check(struct edac_device_ctl_info *edac_dev)
 
 	if (ctx->version <= 1 &&
 	    xgene_edac_l3_promote_to_uc_err(l3cesr, l3celr)) {
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 		return;
 	}
 	if (l3cesr & L3C_ESR_CERR_MASK)
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 	if (l3cesr & L3C_ESR_UCERR_MASK)
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static void xgene_edac_l3_hw_init(struct edac_device_ctl_info *edac_dev,
@@ -1449,7 +1449,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev)
 		writel(err_addr_hi, ctx->dev_csr + GLBL_MSEC_ERRH);
 	}
 	if (reg & (SEC_ERR_MASK | MSEC_ERR_MASK))
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 
 	if (reg & DED_ERR_MASK) {
 		err_addr_lo = readl(ctx->dev_csr + GLBL_DED_ERRL);
@@ -1470,7 +1470,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev)
 		writel(err_addr_hi, ctx->dev_csr + GLBL_MDED_ERRH);
 	}
 	if (reg & (DED_ERR_MASK | MDED_ERR_MASK))
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 }
 
 static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev)
@@ -1675,7 +1675,7 @@ static void xgene_edac_soc_check(struct edac_device_ctl_info *edac_dev)
 	if (pcp_lp_stat & CSW_SWITCH_TRACE_ERR_MASK) {
 		dev_info(edac_dev->dev,
 			 "CSW switch trace correctable memory parity error\n");
-		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 	}
 
 	if (!reg)
@@ -1685,14 +1685,14 @@ static void xgene_edac_soc_check(struct edac_device_ctl_info *edac_dev)
 	if (!soc_mem_err) {
 		dev_err(edac_dev->dev, "SoC memory parity error 0x%08X\n",
 			reg);
-		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
+		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
 		return;
 	}
 	for (i = 0; i < 31; i++) {
 		if (reg & (1 << i)) {
 			dev_err(edac_dev->dev, "%s memory parity error\n",
 				soc_mem_err[i]);
-			edac_device_handle_ue(edac_dev, 0, 0,
+			edac_device_handle_ue(edac_dev, 1, 0, 0,
 					      edac_dev->ctl_name);
 		}
 	}
-- 
2.7.4


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-07-15 10:53 [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce() Hanna Hawa
@ 2019-07-17 12:06 ` Jan Glauber
  2019-07-17 14:45   ` Hawa, Hanna
  2019-07-25 18:36 ` Mauro Carvalho Chehab
  2019-08-01 11:35 ` Robert Richter
  2 siblings, 1 reply; 9+ messages in thread
From: Jan Glauber @ 2019-07-17 12:06 UTC (permalink / raw)
  To: Hanna Hawa
  Cc: thor.thayer, bp, mchehab, james.morse, rric, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm

On Mon, Jul 15, 2019 at 01:53:07PM +0300, Hanna Hawa wrote:
> Add a counter parameter in order to avoid losing errors count for edac
> device, the error count reports the number of errors reported by an edac
> device similar to the way MC_EDAC do.
> 
> Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
> ---
>  drivers/edac/altera_edac.c      | 20 ++++++++++++--------
>  drivers/edac/amd8111_edac.c     |  6 +++---
>  drivers/edac/cpc925_edac.c      |  4 ++--
>  drivers/edac/edac_device.c      | 18 ++++++++++--------
>  drivers/edac/edac_device.h      |  8 ++++++--
>  drivers/edac/highbank_l2_edac.c |  4 ++--
>  drivers/edac/mpc85xx_edac.c     |  4 ++--
>  drivers/edac/mv64x60_edac.c     |  4 ++--
>  drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
>  drivers/edac/octeon_edac-pc.c   |  6 +++---
>  drivers/edac/qcom_edac.c        |  8 ++++----
>  drivers/edac/thunderx_edac.c    | 10 +++++-----
>  drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
>  13 files changed, 74 insertions(+), 64 deletions(-)

Hi Hanna,

I'm probably missing something but this patch looks like while it adds
the error_count parameter the passed values all seem to be 1. So is the
new parameter used otherwise, maybe in another patch?

thanks,
Jan

> diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
> index 8816f74..747dd43 100644
> --- a/drivers/edac/altera_edac.c
> +++ b/drivers/edac/altera_edac.c
> @@ -616,12 +616,12 @@ static irqreturn_t altr_edac_device_handler(int irq, void *dev_id)
>  	if (irq == drvdata->sb_irq) {
>  		if (priv->ce_clear_mask)
>  			writel(priv->ce_clear_mask, drvdata->base);
> -		edac_device_handle_ce(dci, 0, 0, drvdata->edac_dev_name);
> +		edac_device_handle_ce(dci, 1, 0, 0, drvdata->edac_dev_name);
>  		ret_value = IRQ_HANDLED;
>  	} else if (irq == drvdata->db_irq) {
>  		if (priv->ue_clear_mask)
>  			writel(priv->ue_clear_mask, drvdata->base);
> -		edac_device_handle_ue(dci, 0, 0, drvdata->edac_dev_name);
> +		edac_device_handle_ue(dci, 1, 0, 0, drvdata->edac_dev_name);
>  		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  		ret_value = IRQ_HANDLED;
>  	} else {
> @@ -919,13 +919,15 @@ static irqreturn_t __maybe_unused altr_edac_a10_ecc_irq(int irq, void *dev_id)
>  	if (irq == dci->sb_irq) {
>  		writel(ALTR_A10_ECC_SERRPENA,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  
>  		return IRQ_HANDLED;
>  	} else if (irq == dci->db_irq) {
>  		writel(ALTR_A10_ECC_DERRPENA,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  		if (dci->data->panic)
>  			panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  
> @@ -1308,14 +1310,16 @@ static irqreturn_t altr_edac_a10_l2_irq(int irq, void *dev_id)
>  		regmap_write(dci->edac->ecc_mgr_map,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_SB);
> -		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  
>  		return IRQ_HANDLED;
>  	} else if (irq == dci->db_irq) {
>  		regmap_write(dci->edac->ecc_mgr_map,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_MB);
> -		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  
>  		return IRQ_HANDLED;
> @@ -1652,12 +1656,12 @@ static irqreturn_t altr_edac_a10_ecc_irq_portb(int irq, void *dev_id)
>  	if (irq == ad->sb_irq) {
>  		writel(priv->ce_clear_mask,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ce(ad->edac_dev, 0, 0, ad->edac_dev_name);
> +		edac_device_handle_ce(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
>  		return IRQ_HANDLED;
>  	} else if (irq == ad->db_irq) {
>  		writel(priv->ue_clear_mask,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ue(ad->edac_dev, 0, 0, ad->edac_dev_name);
> +		edac_device_handle_ue(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
>  		return IRQ_HANDLED;
>  	}
>  
> diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
> index b5786cf..e595fab 100644
> --- a/drivers/edac/amd8111_edac.c
> +++ b/drivers/edac/amd8111_edac.c
> @@ -303,7 +303,7 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
>  		val8 |= IO_CTRL_1_CLEAR_MASK;
>  		edac_pci_write_byte(dev, REG_IO_CTRL_1, val8);
>  
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  	}
>  
>  	if (at_compat_reg_broken == 0) {
> @@ -315,8 +315,8 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
>  			out8 |= AT_COMPAT_CLRIOCHK;
>  		if (out8 > 0) {
>  			__do_outb(out8, REG_AT_COMPAT);
> -			edac_device_handle_ue(edac_dev, 0, 0,
> -						edac_dev->ctl_name);
> +			edac_device_handle_ue(edac_dev, 1, 0, 0,
> +					      edac_dev->ctl_name);
>  		}
>  	}
>  }
> diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
> index 3c0881a..eb74865 100644
> --- a/drivers/edac/cpc925_edac.c
> +++ b/drivers/edac/cpc925_edac.c
> @@ -682,7 +682,7 @@ static void cpc925_cpu_check(struct edac_device_ctl_info *edac_dev)
>  	cpc925_printk(KERN_INFO, "APIMASK		0x%08x\n", apimask);
>  	cpc925_printk(KERN_INFO, "APIEXCP		0x%08x\n", apiexcp);
>  
> -	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  /******************** HT Link err device****************************/
> @@ -756,7 +756,7 @@ static void cpc925_htlink_check(struct edac_device_ctl_info *edac_dev)
>  		__raw_writel(HT_LINKERR_DETECTED,
>  				dev_info->vbase + REG_LINKERR_OFFSET);
>  
> -	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static struct cpc925_dev_info cpc925_devs[] = {
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 65cf2b9..d1de296 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -556,7 +556,8 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
>  }
>  
>  void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> -			int inst_nr, int block_nr, const char *msg)
> +			   u16 error_count, int inst_nr, int block_nr,
> +			   const char *msg)
>  {
>  	struct edac_device_instance *instance;
>  	struct edac_device_block *block = NULL;
> @@ -582,12 +583,12 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>  
>  	if (instance->nr_blocks > 0) {
>  		block = instance->blocks + block_nr;
> -		block->counters.ce_count++;
> +		block->counters.ce_count += error_count;
>  	}
>  
>  	/* Propagate the count up the 'totals' tree */
> -	instance->counters.ce_count++;
> -	edac_dev->counters.ce_count++;
> +	instance->counters.ce_count += error_count;
> +	edac_dev->counters.ce_count += error_count;
>  
>  	if (edac_device_get_log_ce(edac_dev))
>  		edac_device_printk(edac_dev, KERN_WARNING,
> @@ -598,7 +599,8 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>  EXPORT_SYMBOL_GPL(edac_device_handle_ce);
>  
>  void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> -			int inst_nr, int block_nr, const char *msg)
> +			   u16 error_count, int inst_nr, int block_nr,
> +			   const char *msg)
>  {
>  	struct edac_device_instance *instance;
>  	struct edac_device_block *block = NULL;
> @@ -624,12 +626,12 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>  
>  	if (instance->nr_blocks > 0) {
>  		block = instance->blocks + block_nr;
> -		block->counters.ue_count++;
> +		block->counters.ue_count += error_count;
>  	}
>  
>  	/* Propagate the count up the 'totals' tree */
> -	instance->counters.ue_count++;
> -	edac_dev->counters.ue_count++;
> +	instance->counters.ue_count += error_count;
> +	edac_dev->counters.ue_count += error_count;
>  
>  	if (edac_device_get_log_ue(edac_dev))
>  		edac_device_printk(edac_dev, KERN_EMERG,
> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
> index 1aaba74..cf1a1da 100644
> --- a/drivers/edac/edac_device.h
> +++ b/drivers/edac/edac_device.h
> @@ -290,23 +290,27 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>   *	perform a common output and handling of an 'edac_dev' UE event
>   *
>   * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
>   * @inst_nr: number of the instance where the UE error happened
>   * @block_nr: number of the block where the UE error happened
>   * @msg: message to be printed
>   */
>  extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> -				int inst_nr, int block_nr, const char *msg);
> +				  u16 error_count, int inst_nr, int block_nr,
> +				  const char *msg);
>  /**
>   * edac_device_handle_ce():
>   *	perform a common output and handling of an 'edac_dev' CE event
>   *
>   * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
>   * @inst_nr: number of the instance where the CE error happened
>   * @block_nr: number of the block where the CE error happened
>   * @msg: message to be printed
>   */
>  extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> -				int inst_nr, int block_nr, const char *msg);
> +				  u16 error_count, int inst_nr, int block_nr,
> +				  const char *msg);
>  
>  /**
>   * edac_device_alloc_index: Allocate a unique device index number
> diff --git a/drivers/edac/highbank_l2_edac.c b/drivers/edac/highbank_l2_edac.c
> index cd9a2bb..65f016a 100644
> --- a/drivers/edac/highbank_l2_edac.c
> +++ b/drivers/edac/highbank_l2_edac.c
> @@ -39,11 +39,11 @@ static irqreturn_t highbank_l2_err_handler(int irq, void *dev_id)
>  
>  	if (irq == drvdata->sb_irq) {
>  		writel(1, drvdata->base + SR_CLR_SB_ECC_INTR);
> -		edac_device_handle_ce(dci, 0, 0, dci->ctl_name);
> +		edac_device_handle_ce(dci, 1, 0, 0, dci->ctl_name);
>  	}
>  	if (irq == drvdata->db_irq) {
>  		writel(1, drvdata->base + SR_CLR_DB_ECC_INTR);
> -		edac_device_handle_ue(dci, 0, 0, dci->ctl_name);
> +		edac_device_handle_ue(dci, 1, 0, 0, dci->ctl_name);
>  	}
>  
>  	return IRQ_HANDLED;
> diff --git a/drivers/edac/mpc85xx_edac.c b/drivers/edac/mpc85xx_edac.c
> index 67f7bc3..0618a06 100644
> --- a/drivers/edac/mpc85xx_edac.c
> +++ b/drivers/edac/mpc85xx_edac.c
> @@ -464,10 +464,10 @@ static void mpc85xx_l2_check(struct edac_device_ctl_info *edac_dev)
>  	out_be32(pdata->l2_vbase + MPC85XX_L2_ERRDET, err_detect);
>  
>  	if (err_detect & L2_EDE_CE_MASK)
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  
>  	if (err_detect & L2_EDE_UE_MASK)
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static irqreturn_t mpc85xx_l2_isr(int irq, void *dev_id)
> diff --git a/drivers/edac/mv64x60_edac.c b/drivers/edac/mv64x60_edac.c
> index 3c68bb5..005b012 100644
> --- a/drivers/edac/mv64x60_edac.c
> +++ b/drivers/edac/mv64x60_edac.c
> @@ -251,7 +251,7 @@ static void mv64x60_sram_check(struct edac_device_ctl_info *edac_dev)
>  	       readl(pdata->sram_vbase + MV64X60_SRAM_ERR_PARITY));
>  	writel(0, pdata->sram_vbase + MV64X60_SRAM_ERR_CAUSE);
>  
> -	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static irqreturn_t mv64x60_sram_isr(int irq, void *dev_id)
> @@ -417,7 +417,7 @@ static void mv64x60_cpu_check(struct edac_device_ctl_info *edac_dev)
>  	       readl(pdata->cpu_vbase[1] + MV64x60_CPU_ERR_PARITY));
>  	writel(0, pdata->cpu_vbase[1] + MV64x60_CPU_ERR_CAUSE);
>  
> -	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static irqreturn_t mv64x60_cpu_isr(int irq, void *dev_id)
> diff --git a/drivers/edac/octeon_edac-l2c.c b/drivers/edac/octeon_edac-l2c.c
> index c33059e..8e58531 100644
> --- a/drivers/edac/octeon_edac-l2c.c
> +++ b/drivers/edac/octeon_edac-l2c.c
> @@ -28,12 +28,12 @@ static void octeon_l2c_poll_oct1(struct edac_device_ctl_info *l2c)
>  	l2t_err_reset.u64 = 0;
>  	l2t_err.u64 = cvmx_read_csr(CVMX_L2T_ERR);
>  	if (l2t_err.s.sec_err) {
> -		edac_device_handle_ce(l2c, 0, 0,
> +		edac_device_handle_ce(l2c, 1, 0, 0,
>  				      "Tag Single bit error (corrected)");
>  		l2t_err_reset.s.sec_err = 1;
>  	}
>  	if (l2t_err.s.ded_err) {
> -		edac_device_handle_ue(l2c, 0, 0,
> +		edac_device_handle_ue(l2c, 1, 0, 0,
>  				      "Tag Double bit error (detected)");
>  		l2t_err_reset.s.ded_err = 1;
>  	}
> @@ -43,12 +43,12 @@ static void octeon_l2c_poll_oct1(struct edac_device_ctl_info *l2c)
>  	l2d_err_reset.u64 = 0;
>  	l2d_err.u64 = cvmx_read_csr(CVMX_L2D_ERR);
>  	if (l2d_err.s.sec_err) {
> -		edac_device_handle_ce(l2c, 0, 1,
> +		edac_device_handle_ce(l2c, 1, 0, 1,
>  				      "Data Single bit error (corrected)");
>  		l2d_err_reset.s.sec_err = 1;
>  	}
>  	if (l2d_err.s.ded_err) {
> -		edac_device_handle_ue(l2c, 0, 1,
> +		edac_device_handle_ue(l2c, 1, 0, 1,
>  				      "Data Double bit error (detected)");
>  		l2d_err_reset.s.ded_err = 1;
>  	}
> @@ -76,25 +76,25 @@ static void _octeon_l2c_poll_oct2(struct edac_device_ctl_info *l2c, int tad)
>  		snprintf(buf2, sizeof(buf2),
>  			 "L2D Double bit error (detected):%s", buf1);
>  		err_tdtx_reset.s.dbe = 1;
> -		edac_device_handle_ue(l2c, tad, 1, buf2);
> +		edac_device_handle_ue(l2c, 1, tad, 1, buf2);
>  	}
>  	if (err_tdtx.s.sbe) {
>  		snprintf(buf2, sizeof(buf2),
>  			 "L2D Single bit error (corrected):%s", buf1);
>  		err_tdtx_reset.s.sbe = 1;
> -		edac_device_handle_ce(l2c, tad, 1, buf2);
> +		edac_device_handle_ce(l2c, 1, tad, 1, buf2);
>  	}
>  	if (err_tdtx.s.vdbe) {
>  		snprintf(buf2, sizeof(buf2),
>  			 "VBF Double bit error (detected):%s", buf1);
>  		err_tdtx_reset.s.vdbe = 1;
> -		edac_device_handle_ue(l2c, tad, 1, buf2);
> +		edac_device_handle_ue(l2c, 1, tad, 1, buf2);
>  	}
>  	if (err_tdtx.s.vsbe) {
>  		snprintf(buf2, sizeof(buf2),
>  			 "VBF Single bit error (corrected):%s", buf1);
>  		err_tdtx_reset.s.vsbe = 1;
> -		edac_device_handle_ce(l2c, tad, 1, buf2);
> +		edac_device_handle_ce(l2c, 1, tad, 1, buf2);
>  	}
>  	if (err_tdtx_reset.u64)
>  		cvmx_write_csr(CVMX_L2C_ERR_TDTX(tad), err_tdtx_reset.u64);
> @@ -111,13 +111,13 @@ static void _octeon_l2c_poll_oct2(struct edac_device_ctl_info *l2c, int tad)
>  		snprintf(buf2, sizeof(buf2),
>  			 "Tag Double bit error (detected):%s", buf1);
>  		err_ttgx_reset.s.dbe = 1;
> -		edac_device_handle_ue(l2c, tad, 0, buf2);
> +		edac_device_handle_ue(l2c, 1, tad, 0, buf2);
>  	}
>  	if (err_ttgx.s.sbe) {
>  		snprintf(buf2, sizeof(buf2),
>  			 "Tag Single bit error (corrected):%s", buf1);
>  		err_ttgx_reset.s.sbe = 1;
> -		edac_device_handle_ce(l2c, tad, 0, buf2);
> +		edac_device_handle_ce(l2c, 1, tad, 0, buf2);
>  	}
>  	if (err_ttgx_reset.u64)
>  		cvmx_write_csr(CVMX_L2C_ERR_TTGX(tad), err_ttgx_reset.u64);
> diff --git a/drivers/edac/octeon_edac-pc.c b/drivers/edac/octeon_edac-pc.c
> index 754eced..efd0bbc 100644
> --- a/drivers/edac/octeon_edac-pc.c
> +++ b/drivers/edac/octeon_edac-pc.c
> @@ -59,7 +59,7 @@ static int  co_cache_error_event(struct notifier_block *this,
>  				   (unsigned long long)icache_err, core, cpu,
>  				   read_c0_errorepc());
>  		write_octeon_c0_icacheerr(0);
> -		edac_device_handle_ce(p->ed, cpu, 1, "icache");
> +		edac_device_handle_ce(p->ed, 1, cpu, 1, "icache");
>  	}
>  	if (dcache_err & 1) {
>  		edac_device_printk(p->ed, KERN_ERR,
> @@ -67,9 +67,9 @@ static int  co_cache_error_event(struct notifier_block *this,
>  				   (unsigned long long)dcache_err, core, cpu,
>  				   read_c0_errorepc());
>  		if (event)
> -			edac_device_handle_ue(p->ed, cpu, 0, "dcache");
> +			edac_device_handle_ue(p->ed, 1, cpu, 0, "dcache");
>  		else
> -			edac_device_handle_ce(p->ed, cpu, 0, "dcache");
> +			edac_device_handle_ce(p->ed, 1, cpu, 0, "dcache");
>  
>  		/* Clear the error indication */
>  		if (OCTEON_IS_OCTEON2())
> diff --git a/drivers/edac/qcom_edac.c b/drivers/edac/qcom_edac.c
> index 97a27e4..8730eed 100644
> --- a/drivers/edac/qcom_edac.c
> +++ b/drivers/edac/qcom_edac.c
> @@ -261,19 +261,19 @@ dump_syn_reg(struct edac_device_ctl_info *edev_ctl, int err_type, u32 bank)
>  
>  	switch (err_type) {
>  	case LLCC_DRAM_CE:
> -		edac_device_handle_ce(edev_ctl, 0, bank,
> +		edac_device_handle_ce(edev_ctl, 1, 0, bank,
>  				      "LLCC Data RAM correctable Error");
>  		break;
>  	case LLCC_DRAM_UE:
> -		edac_device_handle_ue(edev_ctl, 0, bank,
> +		edac_device_handle_ue(edev_ctl, 1, 0, bank,
>  				      "LLCC Data RAM uncorrectable Error");
>  		break;
>  	case LLCC_TRAM_CE:
> -		edac_device_handle_ce(edev_ctl, 0, bank,
> +		edac_device_handle_ce(edev_ctl, 1, 0, bank,
>  				      "LLCC Tag RAM correctable Error");
>  		break;
>  	case LLCC_TRAM_UE:
> -		edac_device_handle_ue(edev_ctl, 0, bank,
> +		edac_device_handle_ue(edev_ctl, 1, 0, bank,
>  				      "LLCC Tag RAM uncorrectable Error");
>  		break;
>  	default:
> diff --git a/drivers/edac/thunderx_edac.c b/drivers/edac/thunderx_edac.c
> index 34be60f..35a186f 100644
> --- a/drivers/edac/thunderx_edac.c
> +++ b/drivers/edac/thunderx_edac.c
> @@ -1151,7 +1151,7 @@ static irqreturn_t thunderx_ocx_com_threaded_isr(int irq, void *irq_id)
>  			}
>  
>  		if (ctx->reg_com_int & OCX_COM_INT_CE)
> -			edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
> +			edac_device_handle_ce(ocx->edac_dev, 1, 0, 0, msg);
>  
>  		ocx->com_ring_tail++;
>  	}
> @@ -1220,9 +1220,9 @@ static irqreturn_t thunderx_ocx_lnk_threaded_isr(int irq, void *irq_id)
>  		strncat(msg, other, OCX_MESSAGE_SIZE);
>  
>  		if (ctx->reg_com_link_int & OCX_COM_LINK_INT_UE)
> -			edac_device_handle_ue(ocx->edac_dev, 0, 0, msg);
> +			edac_device_handle_ue(ocx->edac_dev, 1, 0, 0, msg);
>  		else if (ctx->reg_com_link_int & OCX_COM_LINK_INT_CE)
> -			edac_device_handle_ce(ocx->edac_dev, 0, 0, msg);
> +			edac_device_handle_ce(ocx->edac_dev, 1, 0, 0, msg);
>  
>  		ocx->link_ring_tail++;
>  	}
> @@ -1899,9 +1899,9 @@ static irqreturn_t thunderx_l2c_threaded_isr(int irq, void *irq_id)
>  		strncat(msg, other, L2C_MESSAGE_SIZE);
>  
>  		if (ctx->reg_int & mask_ue)
> -			edac_device_handle_ue(l2c->edac_dev, 0, 0, msg);
> +			edac_device_handle_ue(l2c->edac_dev, 1, 0, 0, msg);
>  		else if (ctx->reg_int & mask_ce)
> -			edac_device_handle_ce(l2c->edac_dev, 0, 0, msg);
> +			edac_device_handle_ce(l2c->edac_dev, 1, 0, 0, msg);
>  
>  		l2c->ring_tail++;
>  	}
> diff --git a/drivers/edac/xgene_edac.c b/drivers/edac/xgene_edac.c
> index e8b81d7..d31a8bb 100644
> --- a/drivers/edac/xgene_edac.c
> +++ b/drivers/edac/xgene_edac.c
> @@ -574,7 +574,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
>  
>  	if (val & (MEMERR_CPU_ICFESR_CERR_MASK |
>  		   MEMERR_CPU_ICFESR_MULTCERR_MASK))
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  
>  chk_lsu:
>  	val = readl(pg_f + MEMERR_CPU_LSUESR_PAGE_OFFSET);
> @@ -618,7 +618,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
>  
>  	if (val & (MEMERR_CPU_LSUESR_CERR_MASK |
>  		   MEMERR_CPU_LSUESR_MULTCERR_MASK))
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  
>  chk_mmu:
>  	val = readl(pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET);
> @@ -665,7 +665,7 @@ static void xgene_edac_pmd_l1_check(struct edac_device_ctl_info *edac_dev,
>  	/* Clear any HW errors */
>  	writel(val, pg_f + MEMERR_CPU_MMUESR_PAGE_OFFSET);
>  
> -	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static void xgene_edac_pmd_l2_check(struct edac_device_ctl_info *edac_dev)
> @@ -724,10 +724,10 @@ static void xgene_edac_pmd_l2_check(struct edac_device_ctl_info *edac_dev)
>  
>  	if (val & (MEMERR_L2C_L2ESR_ERR_MASK |
>  		   MEMERR_L2C_L2ESR_MULTICERR_MASK))
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  	if (val & (MEMERR_L2C_L2ESR_UCERR_MASK |
>  		   MEMERR_L2C_L2ESR_MULTUCERR_MASK))
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  
>  chk_l2c:
>  	/* Check if any memory request timed out on L2 cache */
> @@ -1113,13 +1113,13 @@ static void xgene_edac_l3_check(struct edac_device_ctl_info *edac_dev)
>  
>  	if (ctx->version <= 1 &&
>  	    xgene_edac_l3_promote_to_uc_err(l3cesr, l3celr)) {
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  		return;
>  	}
>  	if (l3cesr & L3C_ESR_CERR_MASK)
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  	if (l3cesr & L3C_ESR_UCERR_MASK)
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static void xgene_edac_l3_hw_init(struct edac_device_ctl_info *edac_dev,
> @@ -1449,7 +1449,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev)
>  		writel(err_addr_hi, ctx->dev_csr + GLBL_MSEC_ERRH);
>  	}
>  	if (reg & (SEC_ERR_MASK | MSEC_ERR_MASK))
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  
>  	if (reg & DED_ERR_MASK) {
>  		err_addr_lo = readl(ctx->dev_csr + GLBL_DED_ERRL);
> @@ -1470,7 +1470,7 @@ static void xgene_edac_iob_gic_report(struct edac_device_ctl_info *edac_dev)
>  		writel(err_addr_hi, ctx->dev_csr + GLBL_MDED_ERRH);
>  	}
>  	if (reg & (DED_ERR_MASK | MDED_ERR_MASK))
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static void xgene_edac_rb_report(struct edac_device_ctl_info *edac_dev)
> @@ -1675,7 +1675,7 @@ static void xgene_edac_soc_check(struct edac_device_ctl_info *edac_dev)
>  	if (pcp_lp_stat & CSW_SWITCH_TRACE_ERR_MASK) {
>  		dev_info(edac_dev->dev,
>  			 "CSW switch trace correctable memory parity error\n");
> -		edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  	}
>  
>  	if (!reg)
> @@ -1685,14 +1685,14 @@ static void xgene_edac_soc_check(struct edac_device_ctl_info *edac_dev)
>  	if (!soc_mem_err) {
>  		dev_err(edac_dev->dev, "SoC memory parity error 0x%08X\n",
>  			reg);
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  		return;
>  	}
>  	for (i = 0; i < 31; i++) {
>  		if (reg & (1 << i)) {
>  			dev_err(edac_dev->dev, "%s memory parity error\n",
>  				soc_mem_err[i]);
> -			edac_device_handle_ue(edac_dev, 0, 0,
> +			edac_device_handle_ue(edac_dev, 1, 0, 0,
>  					      edac_dev->ctl_name);
>  		}
>  	}
> -- 
> 2.7.4

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-07-17 12:06 ` Jan Glauber
@ 2019-07-17 14:45   ` Hawa, Hanna
  0 siblings, 0 replies; 9+ messages in thread
From: Hawa, Hanna @ 2019-07-17 14:45 UTC (permalink / raw)
  To: Jan Glauber
  Cc: thor.thayer, bp, mchehab, james.morse, rric, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm

Hi Jan,

On 7/17/2019 3:06 PM, Jan Glauber wrote:
> Hi Hanna,
> 
> I'm probably missing something but this patch looks like while it adds
> the error_count parameter the passed values all seem to be 1. So is the
> new parameter used otherwise, maybe in another patch?

Yes in another patch. In Amazon L1/L2 edac driver [1], I'm using loop to 
report on multiple L1 or L2 errors. After this patch I'll remove the 
loop and pass the errors count.

[1]: https://lkml.org/lkml/2019/7/15/349

Thanks,
Hanna
> 
> thanks,
> Jan


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-07-15 10:53 [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce() Hanna Hawa
  2019-07-17 12:06 ` Jan Glauber
@ 2019-07-25 18:36 ` Mauro Carvalho Chehab
  2019-07-28  9:34   ` [UNVERIFIED SENDER] " Hawa, Hanna
  2019-08-01 11:35 ` Robert Richter
  2 siblings, 1 reply; 9+ messages in thread
From: Mauro Carvalho Chehab @ 2019-07-25 18:36 UTC (permalink / raw)
  To: Hanna Hawa
  Cc: thor.thayer, bp, james.morse, rric, morbidrsa, ralf, david.daney,
	andy.gross, david.brown, ckadabi, vnkgutta, jglauber, khuong,
	dwmw, benh, ronenk, talel, jonnyc, hanochu, linux-edac,
	linux-kernel, linux-mips, linux-arm-msm

Em Mon, 15 Jul 2019 13:53:07 +0300
Hanna Hawa <hhhawa@amazon.com> escreveu:

> Add a counter parameter in order to avoid losing errors count for edac
> device, the error count reports the number of errors reported by an edac
> device similar to the way MC_EDAC do.
> 
> Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
> ---
>  drivers/edac/altera_edac.c      | 20 ++++++++++++--------
>  drivers/edac/amd8111_edac.c     |  6 +++---
>  drivers/edac/cpc925_edac.c      |  4 ++--
>  drivers/edac/edac_device.c      | 18 ++++++++++--------
>  drivers/edac/edac_device.h      |  8 ++++++--
>  drivers/edac/highbank_l2_edac.c |  4 ++--
>  drivers/edac/mpc85xx_edac.c     |  4 ++--
>  drivers/edac/mv64x60_edac.c     |  4 ++--
>  drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
>  drivers/edac/octeon_edac-pc.c   |  6 +++---
>  drivers/edac/qcom_edac.c        |  8 ++++----
>  drivers/edac/thunderx_edac.c    | 10 +++++-----
>  drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
>  13 files changed, 74 insertions(+), 64 deletions(-)
> 
> diff --git a/drivers/edac/altera_edac.c b/drivers/edac/altera_edac.c
> index 8816f74..747dd43 100644
> --- a/drivers/edac/altera_edac.c
> +++ b/drivers/edac/altera_edac.c
> @@ -616,12 +616,12 @@ static irqreturn_t altr_edac_device_handler(int irq, void *dev_id)
>  	if (irq == drvdata->sb_irq) {
>  		if (priv->ce_clear_mask)
>  			writel(priv->ce_clear_mask, drvdata->base);
> -		edac_device_handle_ce(dci, 0, 0, drvdata->edac_dev_name);
> +		edac_device_handle_ce(dci, 1, 0, 0, drvdata->edac_dev_name);
>  		ret_value = IRQ_HANDLED;
>  	} else if (irq == drvdata->db_irq) {
>  		if (priv->ue_clear_mask)
>  			writel(priv->ue_clear_mask, drvdata->base);
> -		edac_device_handle_ue(dci, 0, 0, drvdata->edac_dev_name);
> +		edac_device_handle_ue(dci, 1, 0, 0, drvdata->edac_dev_name);
>  		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  		ret_value = IRQ_HANDLED;
>  	} else {
> @@ -919,13 +919,15 @@ static irqreturn_t __maybe_unused altr_edac_a10_ecc_irq(int irq, void *dev_id)
>  	if (irq == dci->sb_irq) {
>  		writel(ALTR_A10_ECC_SERRPENA,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  
>  		return IRQ_HANDLED;
>  	} else if (irq == dci->db_irq) {
>  		writel(ALTR_A10_ECC_DERRPENA,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  		if (dci->data->panic)
>  			panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  
> @@ -1308,14 +1310,16 @@ static irqreturn_t altr_edac_a10_l2_irq(int irq, void *dev_id)
>  		regmap_write(dci->edac->ecc_mgr_map,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_SB);
> -		edac_device_handle_ce(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ce(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  
>  		return IRQ_HANDLED;
>  	} else if (irq == dci->db_irq) {
>  		regmap_write(dci->edac->ecc_mgr_map,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_OFST,
>  			     A10_SYSGMR_MPU_CLEAR_L2_ECC_MB);
> -		edac_device_handle_ue(dci->edac_dev, 0, 0, dci->edac_dev_name);
> +		edac_device_handle_ue(dci->edac_dev, 1, 0, 0,
> +				      dci->edac_dev_name);
>  		panic("\nEDAC:ECC_DEVICE[Uncorrectable errors]\n");
>  
>  		return IRQ_HANDLED;
> @@ -1652,12 +1656,12 @@ static irqreturn_t altr_edac_a10_ecc_irq_portb(int irq, void *dev_id)
>  	if (irq == ad->sb_irq) {
>  		writel(priv->ce_clear_mask,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ce(ad->edac_dev, 0, 0, ad->edac_dev_name);
> +		edac_device_handle_ce(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
>  		return IRQ_HANDLED;
>  	} else if (irq == ad->db_irq) {
>  		writel(priv->ue_clear_mask,
>  		       base + ALTR_A10_ECC_INTSTAT_OFST);
> -		edac_device_handle_ue(ad->edac_dev, 0, 0, ad->edac_dev_name);
> +		edac_device_handle_ue(ad->edac_dev, 1, 0, 0, ad->edac_dev_name);
>  		return IRQ_HANDLED;
>  	}
>  
> diff --git a/drivers/edac/amd8111_edac.c b/drivers/edac/amd8111_edac.c
> index b5786cf..e595fab 100644
> --- a/drivers/edac/amd8111_edac.c
> +++ b/drivers/edac/amd8111_edac.c
> @@ -303,7 +303,7 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
>  		val8 |= IO_CTRL_1_CLEAR_MASK;
>  		edac_pci_write_byte(dev, REG_IO_CTRL_1, val8);
>  
> -		edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +		edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  	}
>  
>  	if (at_compat_reg_broken == 0) {
> @@ -315,8 +315,8 @@ static void amd8111_lpc_bridge_check(struct edac_device_ctl_info *edac_dev)
>  			out8 |= AT_COMPAT_CLRIOCHK;
>  		if (out8 > 0) {
>  			__do_outb(out8, REG_AT_COMPAT);
> -			edac_device_handle_ue(edac_dev, 0, 0,
> -						edac_dev->ctl_name);
> +			edac_device_handle_ue(edac_dev, 1, 0, 0,
> +					      edac_dev->ctl_name);
>  		}
>  	}
>  }
> diff --git a/drivers/edac/cpc925_edac.c b/drivers/edac/cpc925_edac.c
> index 3c0881a..eb74865 100644
> --- a/drivers/edac/cpc925_edac.c
> +++ b/drivers/edac/cpc925_edac.c
> @@ -682,7 +682,7 @@ static void cpc925_cpu_check(struct edac_device_ctl_info *edac_dev)
>  	cpc925_printk(KERN_INFO, "APIMASK		0x%08x\n", apimask);
>  	cpc925_printk(KERN_INFO, "APIEXCP		0x%08x\n", apiexcp);
>  
> -	edac_device_handle_ue(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ue(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  /******************** HT Link err device****************************/
> @@ -756,7 +756,7 @@ static void cpc925_htlink_check(struct edac_device_ctl_info *edac_dev)
>  		__raw_writel(HT_LINKERR_DETECTED,
>  				dev_info->vbase + REG_LINKERR_OFFSET);
>  
> -	edac_device_handle_ce(edac_dev, 0, 0, edac_dev->ctl_name);
> +	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev->ctl_name);
>  }
>  
>  static struct cpc925_dev_info cpc925_devs[] = {
> diff --git a/drivers/edac/edac_device.c b/drivers/edac/edac_device.c
> index 65cf2b9..d1de296 100644
> --- a/drivers/edac/edac_device.c
> +++ b/drivers/edac/edac_device.c
> @@ -556,7 +556,8 @@ static inline int edac_device_get_panic_on_ue(struct edac_device_ctl_info
>  }
>  
>  void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> -			int inst_nr, int block_nr, const char *msg)
> +			   u16 error_count, int inst_nr, int block_nr,
> +			   const char *msg)
>  {
>  	struct edac_device_instance *instance;
>  	struct edac_device_block *block = NULL;
> @@ -582,12 +583,12 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>  
>  	if (instance->nr_blocks > 0) {
>  		block = instance->blocks + block_nr;
> -		block->counters.ce_count++;
> +		block->counters.ce_count += error_count;
>  	}
>  
>  	/* Propagate the count up the 'totals' tree */
> -	instance->counters.ce_count++;
> -	edac_dev->counters.ce_count++;
> +	instance->counters.ce_count += error_count;
> +	edac_dev->counters.ce_count += error_count;
>  
>  	if (edac_device_get_log_ce(edac_dev))
>  		edac_device_printk(edac_dev, KERN_WARNING,
> @@ -598,7 +599,8 @@ void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
>  EXPORT_SYMBOL_GPL(edac_device_handle_ce);
>  
>  void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> -			int inst_nr, int block_nr, const char *msg)
> +			   u16 error_count, int inst_nr, int block_nr,
> +			   const char *msg)
>  {
>  	struct edac_device_instance *instance;
>  	struct edac_device_block *block = NULL;
> @@ -624,12 +626,12 @@ void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>  
>  	if (instance->nr_blocks > 0) {
>  		block = instance->blocks + block_nr;
> -		block->counters.ue_count++;
> +		block->counters.ue_count += error_count;
>  	}
>  
>  	/* Propagate the count up the 'totals' tree */
> -	instance->counters.ue_count++;
> -	edac_dev->counters.ue_count++;
> +	instance->counters.ue_count += error_count;
> +	edac_dev->counters.ue_count += error_count;

Patch itself looks a good idea, but maybe it should rise a WARN()
if error_count == 0.

That applies for both CE and UE error logic.

Thanks,
Mauro

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [UNVERIFIED SENDER] Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-07-25 18:36 ` Mauro Carvalho Chehab
@ 2019-07-28  9:34   ` " Hawa, Hanna
  0 siblings, 0 replies; 9+ messages in thread
From: Hawa, Hanna @ 2019-07-28  9:34 UTC (permalink / raw)
  To: Mauro Carvalho Chehab
  Cc: thor.thayer, bp, james.morse, rric, morbidrsa, ralf, david.daney,
	andy.gross, david.brown, ckadabi, vnkgutta, jglauber, khuong,
	dwmw, benh, ronenk, talel, jonnyc, hanochu, linux-edac,
	linux-kernel, linux-mips, linux-arm-msm



On 7/25/2019 9:36 PM, Mauro Carvalho Chehab wrote:
>>   	/* Propagate the count up the 'totals' tree */
>> -	instance->counters.ue_count++;
>> -	edac_dev->counters.ue_count++;
>> +	instance->counters.ue_count += error_count;
>> +	edac_dev->counters.ue_count += error_count;
> Patch itself looks a good idea, but maybe it should rise a WARN()
> if error_count == 0.
Good point, shouldn't we use WARN_ONCE here? if the user call 
edac_device_handle_ue() with error count == 0, it not be change in 
run-time, only if the error count parameter is calculated somehow, and 
it'll be the *caller* issue that didn't check the error count.
What you think?

> 
> That applies for both CE and UE error logic.
Sure.

Thanks,
Hanna
> 
> Thanks,
> Mauro


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-07-15 10:53 [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce() Hanna Hawa
  2019-07-17 12:06 ` Jan Glauber
  2019-07-25 18:36 ` Mauro Carvalho Chehab
@ 2019-08-01 11:35 ` Robert Richter
  2019-08-01 12:29   ` Hawa, Hanna
  2 siblings, 1 reply; 9+ messages in thread
From: Robert Richter @ 2019-08-01 11:35 UTC (permalink / raw)
  To: Hanna Hawa
  Cc: thor.thayer, bp, mchehab, james.morse, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm

On 15.07.19 13:53:07, Hanna Hawa wrote:
> Add a counter parameter in order to avoid losing errors count for edac
> device, the error count reports the number of errors reported by an edac
> device similar to the way MC_EDAC do.
> 
> Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
> ---
>  drivers/edac/altera_edac.c      | 20 ++++++++++++--------
>  drivers/edac/amd8111_edac.c     |  6 +++---
>  drivers/edac/cpc925_edac.c      |  4 ++--
>  drivers/edac/edac_device.c      | 18 ++++++++++--------
>  drivers/edac/edac_device.h      |  8 ++++++--
>  drivers/edac/highbank_l2_edac.c |  4 ++--
>  drivers/edac/mpc85xx_edac.c     |  4 ++--
>  drivers/edac/mv64x60_edac.c     |  4 ++--
>  drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
>  drivers/edac/octeon_edac-pc.c   |  6 +++---
>  drivers/edac/qcom_edac.c        |  8 ++++----
>  drivers/edac/thunderx_edac.c    | 10 +++++-----
>  drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
>  13 files changed, 74 insertions(+), 64 deletions(-)

> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
> index 1aaba74..cf1a1da 100644
> --- a/drivers/edac/edac_device.h
> +++ b/drivers/edac/edac_device.h
> @@ -290,23 +290,27 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>   *	perform a common output and handling of an 'edac_dev' UE event
>   *
>   * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
>   * @inst_nr: number of the instance where the UE error happened
>   * @block_nr: number of the block where the UE error happened
>   * @msg: message to be printed
>   */
>  extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> -				int inst_nr, int block_nr, const char *msg);
> +				  u16 error_count, int inst_nr, int block_nr,
> +				  const char *msg);
>  /**
>   * edac_device_handle_ce():
>   *	perform a common output and handling of an 'edac_dev' CE event
>   *
>   * @edac_dev: pointer to struct &edac_device_ctl_info
> + * @error_count: number of errors of the same type
>   * @inst_nr: number of the instance where the CE error happened
>   * @block_nr: number of the block where the CE error happened
>   * @msg: message to be printed
>   */
>  extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,

How about renaming this to __edac_device_handle_ce() and then have 2
macros for:

 * edac_device_handle_ce() to keep old i/f.

 * edac_device_handle_ce_count(), with count parameter added.

Same for uncorrectable errors.

Code of other driver can be kept as it is then.

Thanks,

-Robert

> -				int inst_nr, int block_nr, const char *msg);
> +				  u16 error_count, int inst_nr, int block_nr,
> +				  const char *msg);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-08-01 11:35 ` Robert Richter
@ 2019-08-01 12:29   ` Hawa, Hanna
  2019-08-01 14:17     ` Robert Richter
  0 siblings, 1 reply; 9+ messages in thread
From: Hawa, Hanna @ 2019-08-01 12:29 UTC (permalink / raw)
  To: Robert Richter
  Cc: thor.thayer, bp, mchehab, james.morse, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm



On 8/1/2019 2:35 PM, Robert Richter wrote:
> On 15.07.19 13:53:07, Hanna Hawa wrote:
>> Add a counter parameter in order to avoid losing errors count for edac
>> device, the error count reports the number of errors reported by an edac
>> device similar to the way MC_EDAC do.
>>
>> Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
>> ---
>>   drivers/edac/altera_edac.c      | 20 ++++++++++++--------
>>   drivers/edac/amd8111_edac.c     |  6 +++---
>>   drivers/edac/cpc925_edac.c      |  4 ++--
>>   drivers/edac/edac_device.c      | 18 ++++++++++--------
>>   drivers/edac/edac_device.h      |  8 ++++++--
>>   drivers/edac/highbank_l2_edac.c |  4 ++--
>>   drivers/edac/mpc85xx_edac.c     |  4 ++--
>>   drivers/edac/mv64x60_edac.c     |  4 ++--
>>   drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
>>   drivers/edac/octeon_edac-pc.c   |  6 +++---
>>   drivers/edac/qcom_edac.c        |  8 ++++----
>>   drivers/edac/thunderx_edac.c    | 10 +++++-----
>>   drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
>>   13 files changed, 74 insertions(+), 64 deletions(-)
> 
>> diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
>> index 1aaba74..cf1a1da 100644
>> --- a/drivers/edac/edac_device.h
>> +++ b/drivers/edac/edac_device.h
>> @@ -290,23 +290,27 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
>>    *	perform a common output and handling of an 'edac_dev' UE event
>>    *
>>    * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>>    * @inst_nr: number of the instance where the UE error happened
>>    * @block_nr: number of the block where the UE error happened
>>    * @msg: message to be printed
>>    */
>>   extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
>> -				int inst_nr, int block_nr, const char *msg);
>> +				  u16 error_count, int inst_nr, int block_nr,
>> +				  const char *msg);
>>   /**
>>    * edac_device_handle_ce():
>>    *	perform a common output and handling of an 'edac_dev' CE event
>>    *
>>    * @edac_dev: pointer to struct &edac_device_ctl_info
>> + * @error_count: number of errors of the same type
>>    * @inst_nr: number of the instance where the CE error happened
>>    * @block_nr: number of the block where the CE error happened
>>    * @msg: message to be printed
>>    */
>>   extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> 
> How about renaming this to __edac_device_handle_ce() and then have 2
> macros for:
> 
>   * edac_device_handle_ce() to keep old i/f.
> 
>   * edac_device_handle_ce_count(), with count parameter added.
> 
> Same for uncorrectable errors.
> 
> Code of other driver can be kept as it is then.

Don't you think it'll be confused to have different APIs between EDAC_MC 
and EDAC_DEVICE?
(in MC the count passed as part of edac_mc_handle_error())

I don't have strong objection, the change for other drivers is not that 
hard.

> 
> Thanks,
> 
> -Robert
> 
>> -				int inst_nr, int block_nr, const char *msg);
>> +				  u16 error_count, int inst_nr, int block_nr,
>> +				  const char *msg);

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-08-01 12:29   ` Hawa, Hanna
@ 2019-08-01 14:17     ` Robert Richter
  2019-08-01 14:30       ` Hawa, Hanna
  0 siblings, 1 reply; 9+ messages in thread
From: Robert Richter @ 2019-08-01 14:17 UTC (permalink / raw)
  To: Hawa, Hanna
  Cc: thor.thayer, bp, mchehab, james.morse, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm

On 01.08.19 15:29:03, Hawa, Hanna wrote:
> On 8/1/2019 2:35 PM, Robert Richter wrote:
> > On 15.07.19 13:53:07, Hanna Hawa wrote:
> > > Add a counter parameter in order to avoid losing errors count for edac
> > > device, the error count reports the number of errors reported by an edac
> > > device similar to the way MC_EDAC do.
> > > 
> > > Signed-off-by: Hanna Hawa <hhhawa@amazon.com>
> > > ---
> > >   drivers/edac/altera_edac.c      | 20 ++++++++++++--------
> > >   drivers/edac/amd8111_edac.c     |  6 +++---
> > >   drivers/edac/cpc925_edac.c      |  4 ++--
> > >   drivers/edac/edac_device.c      | 18 ++++++++++--------
> > >   drivers/edac/edac_device.h      |  8 ++++++--
> > >   drivers/edac/highbank_l2_edac.c |  4 ++--
> > >   drivers/edac/mpc85xx_edac.c     |  4 ++--
> > >   drivers/edac/mv64x60_edac.c     |  4 ++--
> > >   drivers/edac/octeon_edac-l2c.c  | 20 ++++++++++----------
> > >   drivers/edac/octeon_edac-pc.c   |  6 +++---
> > >   drivers/edac/qcom_edac.c        |  8 ++++----
> > >   drivers/edac/thunderx_edac.c    | 10 +++++-----
> > >   drivers/edac/xgene_edac.c       | 26 +++++++++++++-------------
> > >   13 files changed, 74 insertions(+), 64 deletions(-)
> > 
> > > diff --git a/drivers/edac/edac_device.h b/drivers/edac/edac_device.h
> > > index 1aaba74..cf1a1da 100644
> > > --- a/drivers/edac/edac_device.h
> > > +++ b/drivers/edac/edac_device.h
> > > @@ -290,23 +290,27 @@ extern struct edac_device_ctl_info *edac_device_del_device(struct device *dev);
> > >    *	perform a common output and handling of an 'edac_dev' UE event
> > >    *
> > >    * @edac_dev: pointer to struct &edac_device_ctl_info
> > > + * @error_count: number of errors of the same type
> > >    * @inst_nr: number of the instance where the UE error happened
> > >    * @block_nr: number of the block where the UE error happened
> > >    * @msg: message to be printed
> > >    */
> > >   extern void edac_device_handle_ue(struct edac_device_ctl_info *edac_dev,
> > > -				int inst_nr, int block_nr, const char *msg);
> > > +				  u16 error_count, int inst_nr, int block_nr,
> > > +				  const char *msg);
> > >   /**
> > >    * edac_device_handle_ce():
> > >    *	perform a common output and handling of an 'edac_dev' CE event
> > >    *
> > >    * @edac_dev: pointer to struct &edac_device_ctl_info
> > > + * @error_count: number of errors of the same type
> > >    * @inst_nr: number of the instance where the CE error happened
> > >    * @block_nr: number of the block where the CE error happened
> > >    * @msg: message to be printed
> > >    */
> > >   extern void edac_device_handle_ce(struct edac_device_ctl_info *edac_dev,
> > 
> > How about renaming this to __edac_device_handle_ce() and then have 2
> > macros for:
> > 
> >   * edac_device_handle_ce() to keep old i/f.
> > 
> >   * edac_device_handle_ce_count(), with count parameter added.
> > 
> > Same for uncorrectable errors.
> > 
> > Code of other driver can be kept as it is then.
> 
> Don't you think it'll be confused to have different APIs between EDAC_MC and
> EDAC_DEVICE?
> (in MC the count passed as part of edac_mc_handle_error())

I don't think edac_mc_handle_error() with 11 function arguments is a
good reference for somethin we want to adopt. For the majority of
drivers you just introduce another useless argument with the following
pattern:

	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev_name);

IMO, the api should be improved when touching it.

-Robert

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce()
  2019-08-01 14:17     ` Robert Richter
@ 2019-08-01 14:30       ` Hawa, Hanna
  0 siblings, 0 replies; 9+ messages in thread
From: Hawa, Hanna @ 2019-08-01 14:30 UTC (permalink / raw)
  To: Robert Richter
  Cc: thor.thayer, bp, mchehab, james.morse, morbidrsa, ralf,
	david.daney, andy.gross, david.brown, ckadabi, vnkgutta,
	jglauber, khuong, dwmw, benh, ronenk, talel, jonnyc, hanochu,
	linux-edac, linux-kernel, linux-mips, linux-arm-msm



On 8/1/2019 5:17 PM, Robert Richter wrote:
>> Don't you think it'll be confused to have different APIs between EDAC_MC and
>> EDAC_DEVICE?
>> (in MC the count passed as part of edac_mc_handle_error())
> I don't think edac_mc_handle_error() with 11 function arguments is a
> good reference for somethin we want to adopt. For the majority of
> drivers you just introduce another useless argument with the following
> pattern:
> 
> 	edac_device_handle_ce(edac_dev, 1, 0, 0, edac_dev_name);
> 
> IMO, the api should be improved when touching it.

Got it, I'll update the patch as you suggested.

Thanks,
Hanna

> 
> -Robert

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, back to index

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-15 10:53 [RFC 1/1] edac: Add a counter parameter for edac_device_handle_ue/ce() Hanna Hawa
2019-07-17 12:06 ` Jan Glauber
2019-07-17 14:45   ` Hawa, Hanna
2019-07-25 18:36 ` Mauro Carvalho Chehab
2019-07-28  9:34   ` [UNVERIFIED SENDER] " Hawa, Hanna
2019-08-01 11:35 ` Robert Richter
2019-08-01 12:29   ` Hawa, Hanna
2019-08-01 14:17     ` Robert Richter
2019-08-01 14:30       ` Hawa, Hanna

Linux-EDAC Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-edac/0 linux-edac/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-edac linux-edac/ https://lore.kernel.org/linux-edac \
		linux-edac@vger.kernel.org linux-edac@archiver.kernel.org
	public-inbox-index linux-edac


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-edac


AGPL code for this site: git clone https://public-inbox.org/ public-inbox