intel-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
From: Lucas De Marchi <lucas.demarchi@intel.com>
To: intel-gfx@lists.freedesktop.org
Cc: fernando.pacheco@intel.com, Matthew Auld <matthew.auld@intel.com>
Subject: [Intel-gfx] [PATCH 26/37] drm/i915/dg1: Handle GRF/IC ECC error irq
Date: Wed, 20 May 2020 17:37:52 -0700	[thread overview]
Message-ID: <20200521003803.18936-27-lucas.demarchi@intel.com> (raw)
In-Reply-To: <20200521003803.18936-1-lucas.demarchi@intel.com>

From: Fernando Pacheco <fernando.pacheco@intel.com>

The error detection and correction capability
for GRF and instruction cache (IC) will utilize
the new interrupt and error handling infrastructure
for dgfx products. The GFX device can generate
a number of classes of error under the new
infrastructure: correctable, non-fatal, and
fatal errors.

The non-fatal and fatal error classes distinguish
between levels of severity for uncorrectable errors.
All ECC uncorrectable errors will be reported as
fatal to produce the desired system response. Fatal
errors are expected to route as PCIe error messages
which should result in OS issuing a GFX device FLR.
But the option exists to route fatal errors as
interrupts.

Driver will only handle logging of errors. Anything
more will be handled at system level.

For errors that will route as interrupts, three
bits in the Master Interrupt Register will be used
to convey the class of error.

For each class of error:
1. Determine source of error (IP block) by reading
   the Device Error Source Register (RW1C) that
   corresponds to the class of error being serviced.
2. If the generating IP block is GT, read and log the
   GT Error Register (RW1C) that corresponds to the
   class of error being serviced. Non-GT errors will
   be logged in aggregate for now.

Bspec: 50875

Cc: Paulo Zanoni <paulo.r.zanoni@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Cc: Fernando Pacheco <fernando.pacheco@intel.com>
Cc: Radhakrishna Sripada <radhakrishna.sripada@intel.com>
Signed-off-by: Fernando Pacheco <fernando.pacheco@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 121 ++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_reg.h |  28 ++++++++
 2 files changed, 149 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ebc80e8b1599..17e679b910da 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2515,6 +2515,124 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
+static const char *
+hardware_error_type_to_str(const enum hardware_error hw_err)
+{
+	switch (hw_err) {
+	case HARDWARE_ERROR_CORRECTABLE:
+		return "CORRECTABLE";
+	case HARDWARE_ERROR_NONFATAL:
+		return "NONFATAL";
+	case HARDWARE_ERROR_FATAL:
+		return "FATAL";
+	default:
+		return "UNKNOWN";
+	}
+}
+
+static void
+gen12_gt_hw_error_handler(struct drm_i915_private * const i915,
+			  const enum hardware_error hw_err)
+{
+	void __iomem * const regs = i915->uncore.regs;
+	const char *hw_err_str = hardware_error_type_to_str(hw_err);
+	u32 other_errors = ~(EU_GRF_ERROR | EU_IC_ERROR);
+	u32 errstat;
+
+	lockdep_assert_held(&i915->irq_lock);
+
+	errstat = raw_reg_read(regs, ERR_STAT_GT_REG(hw_err));
+
+	if (unlikely(!errstat)) {
+		DRM_ERROR("ERR_STAT_GT_REG_%s blank!\n", hw_err_str);
+		return;
+	}
+
+	/*
+	 * TODO: The GT Non Fatal Error Status Register
+	 * only has reserved bitfields defined.
+	 * Remove once there is something to service.
+	 */
+	if (hw_err == HARDWARE_ERROR_NONFATAL) {
+		DRM_ERROR("detected Non-Fatal hardware error\n");
+		raw_reg_write(regs, ERR_STAT_GT_REG(hw_err), errstat);
+		return;
+	}
+
+	if (errstat & EU_GRF_ERROR)
+		DRM_ERROR("detected EU GRF %s hardware error\n", hw_err_str);
+
+	if (errstat & EU_IC_ERROR)
+		DRM_ERROR("detected EU IC %s hardware error\n", hw_err_str);
+
+	/*
+	 * TODO: The remaining GT errors don't have a
+	 * need for targeted logging at the moment. We
+	 * still want to log detection of these errors, but
+	 * let's aggregate them until someone has a need for them.
+	 */
+	if (errstat & other_errors)
+		DRM_ERROR("detected hardware error(s) in ERR_STAT_GT_REG_%s: 0x%08x\n",
+			  hw_err_str, errstat & other_errors);
+
+	raw_reg_write(regs, ERR_STAT_GT_REG(hw_err), errstat);
+}
+
+static void
+gen12_hw_error_source_handler(struct drm_i915_private * const i915,
+			      const enum hardware_error hw_err)
+{
+	void __iomem * const regs = i915->uncore.regs;
+	const char *hw_err_str = hardware_error_type_to_str(hw_err);
+	u32 errsrc;
+
+	spin_lock(&i915->irq_lock);
+	errsrc = raw_reg_read(regs, DEV_ERR_STAT_REG(hw_err));
+
+	if (unlikely(!errsrc)) {
+		DRM_ERROR("DEV_ERR_STAT_REG_%s blank!\n", hw_err_str);
+		goto out_unlock;
+	}
+
+	if (errsrc & DEV_ERR_STAT_GT_ERROR)
+		gen12_gt_hw_error_handler(i915, hw_err);
+
+	if (errsrc & ~DEV_ERR_STAT_GT_ERROR)
+		DRM_ERROR("non-GT hardware error(s) in DEV_ERR_STAT_REG_%s: 0x%08x\n",
+			  hw_err_str, errsrc & ~DEV_ERR_STAT_GT_ERROR);
+
+	raw_reg_write(regs, DEV_ERR_STAT_REG(hw_err), errsrc);
+
+out_unlock:
+	spin_unlock(&i915->irq_lock);
+}
+
+/*
+ * GEN12+ adds three Error bits to the Master Interrupt
+ * Register to support dgfx card error handling.
+ * These three bits are used to convey the class of error:
+ * FATAL, NONFATAL, or CORRECTABLE.
+ *
+ * To process an interrupt:
+ *	1. Determine source of error (IP block) by reading
+ *	   the Device Error Source Register (RW1C) that
+ *	   corresponds to the class of error being serviced.
+ *	2. For GT as the generating IP block, read and log
+ *	   the GT Error Register (RW1C) that corresponds to
+ *	   the class of error being serviced.
+ */
+static void
+gen12_hw_error_irq_handler(struct drm_i915_private * const i915,
+			   const u32 master_ctl)
+{
+	enum hardware_error hw_err;
+
+	for (hw_err = 0; hw_err < HARDWARE_ERROR_MAX; hw_err++) {
+		if (master_ctl & GEN12_ERROR_IRQ(hw_err))
+			gen12_hw_error_source_handler(i915, hw_err);
+	}
+}
+
 static u32
 gen11_gu_misc_irq_ack(struct intel_gt *gt, const u32 master_ctl)
 {
@@ -2597,6 +2715,9 @@ __gen11_irq_handler(struct drm_i915_private * const i915,
 	/* Find, queue (onto bottom-halves), then clear each source */
 	gen11_gt_irq_handler(gt, master_ctl);
 
+	if (IS_DG1(i915))
+		gen12_hw_error_irq_handler(i915, master_ctl);
+
 	/* IRQs are synced during runtime_suspend, we don't require a wakeref */
 	if (master_ctl & GEN11_DISPLAY_IRQ)
 		gen11_display_irq_handler(i915);
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index e0bd9e02c3d1..40cb361b4254 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -7647,6 +7647,10 @@ enum {
 #define  GEN11_MASTER_IRQ		(1 << 31)
 #define  GEN11_PCU_IRQ			(1 << 30)
 #define  GEN11_GU_MISC_IRQ		(1 << 29)
+#define  GEN12_FATAL_ERROR_IRQ		(1 << 28)
+#define  GEN12_NON_FATAL_ERROR_IRQ	(1 << 27)
+#define  GEN12_CORRECTABLE_ERROR_IRQ	(1 << 26)
+#define  GEN12_ERROR_IRQ(x)		(1 << (26 + (x)))
 #define  GEN11_DISPLAY_IRQ		(1 << 16)
 #define  GEN11_GT_DW_IRQ(x)		(1 << (x))
 #define  GEN11_GT_DW1_IRQ		(1 << 1)
@@ -7738,6 +7742,30 @@ enum {
 
 #define GEN11_IIR_REG_SELECTOR(x)	_MMIO(0x190070 + ((x) * 4))
 
+enum hardware_error {
+	HARDWARE_ERROR_CORRECTABLE = 0,
+	HARDWARE_ERROR_NONFATAL = 1,
+	HARDWARE_ERROR_FATAL = 2,
+	HARDWARE_ERROR_MAX,
+};
+
+#define _DEV_ERR_STAT_FATAL		0x100174
+#define _DEV_ERR_STAT_NONFATAL		0x100178
+#define _DEV_ERR_STAT_CORRECTABLE	0x10017c
+#define DEV_ERR_STAT_REG(x)		_MMIO(_PICK_EVEN((x), \
+						_DEV_ERR_STAT_CORRECTABLE, \
+						_DEV_ERR_STAT_NONFATAL))
+#define  DEV_ERR_STAT_GT_ERROR		(1 << 0)
+
+#define _ERR_STAT_GT_COR		0x100160
+#define _ERR_STAT_GT_NONFATAL		0x100164
+#define _ERR_STAT_GT_FATAL		0x100168
+#define ERR_STAT_GT_REG(x)		_MMIO(_PICK_EVEN((x), \
+						_ERR_STAT_GT_COR, \
+						_ERR_STAT_GT_NONFATAL))
+#define  EU_GRF_ERROR			(1 << 15)
+#define  EU_IC_ERROR			(1 << 14)
+
 #define GEN11_RENDER_COPY_INTR_ENABLE	_MMIO(0x190030)
 #define GEN11_VCS_VECS_INTR_ENABLE	_MMIO(0x190034)
 #define GEN11_GUC_SG_INTR_ENABLE	_MMIO(0x190038)
-- 
2.26.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  parent reply	other threads:[~2020-05-21  0:39 UTC|newest]

Thread overview: 56+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-05-21  0:37 [Intel-gfx] [PATCH 00/37] Introduce DG1 Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 01/37] drm/i915/rkl: Add DPLL4 support Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 02/37] drm/i915/rkl: Add DDC pin mapping Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 03/37] drm/i915/rkl: Setup ports/phys Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 04/37] drm/i915/rkl: provide port/phy mapping for vbt Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 05/37] drm/i915/rkl: Handle HTI Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 06/37] drm/i915/rkl: Handle comp master/slave relationships for PHYs Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 07/37] drm/i915/rkl: Add initial workarounds Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 08/37] drm/i915: make intel_{uncore, de}_rmw() more useful Lucas De Marchi
2020-05-21 17:24   ` Souza, Jose
2020-05-21 17:30     ` Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 09/37] drm/i915: Add has_master_unit_irq flag Lucas De Marchi
2020-05-26 18:10   ` Souza, Jose
2020-05-21  0:37 ` [Intel-gfx] [PATCH 10/37] drm/i915: add pcie snoop flag Lucas De Marchi
2020-05-21  8:15   ` Chris Wilson
2020-05-21  0:37 ` [Intel-gfx] [PATCH 11/37] drm/i915/dg1: add initial DG-1 definitions Lucas De Marchi
2020-05-26 17:34   ` Souza, Jose
2020-05-26 17:51     ` Lucas De Marchi
2020-05-26 18:02       ` Souza, Jose
2020-05-26 17:51   ` Souza, Jose
2020-05-21  0:37 ` [Intel-gfx] [PATCH 12/37] drm/i915/dg1: Add DG1 PCI IDs Lucas De Marchi
2020-05-26 17:35   ` Souza, Jose
2020-05-21  0:37 ` [Intel-gfx] [PATCH 13/37] drm/i915/dg1: Add fake PCH Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 14/37] drm/i915/dg1: Initialize RAWCLK properly Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 15/37] drm/i915/dg1: Define MOCS table for DG1 Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 16/37] drm/i915/dg1: Add DG1 power wells Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 17/37] drm/i915/dg1: Increase mmio size to 4MB Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 18/37] drm/i915/dg1: add support for the master unit interrupt Lucas De Marchi
2020-05-26 18:02   ` Souza, Jose
2020-05-21  0:37 ` [Intel-gfx] [PATCH 19/37] drm/i915/dg1: Wait for pcode/uncore handshake at startup Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 20/37] drm/i915/dg1: Add DPLL macros for DG1 Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 21/37] drm/i915/dg1: Add and setup DPLLs " Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 22/37] drm/i915/dg1: Enable DPLL " Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 23/37] drm/i915/dg1: add hpd interrupt handling Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 24/37] drm/i915/dg1: invert HPD pins Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 25/37] drm/i915/dg1: gmbus pin mapping Lucas De Marchi
2020-05-21  0:37 ` Lucas De Marchi [this message]
2020-05-21  8:19   ` [Intel-gfx] [PATCH 26/37] drm/i915/dg1: Handle GRF/IC ECC error irq Chris Wilson
2020-05-21  0:37 ` [Intel-gfx] [PATCH 27/37] drm/i915/dg1: Log counter on SLM ECC error Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 28/37] drm/i915/dg1: Enable first 2 ports for DG1 Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 29/37] drm/i915/dg1: Don't program PHY_MISC for PHY-C and PHY-D Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 30/37] drm/i915/dg1: Update comp master/slave relationships for PHYs Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 31/37] drm/i915/dg1: Update voltage swing tables for DP Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 32/37] drm/i915/dg1: provide port/phy mapping for vbt Lucas De Marchi
2020-05-21  0:37 ` [Intel-gfx] [PATCH 33/37] drm/i915/dg1: map/unmap pll clocks Lucas De Marchi
2020-05-21  0:38 ` [Intel-gfx] [PATCH 34/37] drm/i915/dg1: enable PORT C/D aka D/E Lucas De Marchi
2020-05-21  0:38 ` [Intel-gfx] [PATCH 35/37] drm/i915/dg1: Load DMC Lucas De Marchi
2020-05-26 17:42   ` Souza, Jose
2020-05-26 17:49     ` Lucas De Marchi
2020-05-21  0:38 ` [Intel-gfx] [PATCH 36/37] drm/i915/dg1: Add initial DG1 workarounds Lucas De Marchi
2020-05-21  0:38 ` [Intel-gfx] [PATCH 37/37] drm/i915/dg1: Remove SHPD_FILTER_CNT register programming Lucas De Marchi
2020-05-26 17:44   ` Souza, Jose
2020-05-21  1:05 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for Introduce DG1 Patchwork
2020-05-21  1:06 ` [Intel-gfx] ✗ Fi.CI.SPARSE: " Patchwork
2020-05-21  1:26 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2020-05-21 18:34 ` [Intel-gfx] ✓ Fi.CI.IGT: " Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200521003803.18936-27-lucas.demarchi@intel.com \
    --to=lucas.demarchi@intel.com \
    --cc=fernando.pacheco@intel.com \
    --cc=intel-gfx@lists.freedesktop.org \
    --cc=matthew.auld@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).