[PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-24  9:43 ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-24  9:43 UTC (permalink / raw)
  To: Intel-gfx; +Cc: dri-devel, Tvrtko Ursulin

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Catch and log any garbage in the register, including no tiles marked, or
multiple tiles marked.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
---
We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
during glmark and more badness. So I thought let's log all possible
failure modes from here and also use per-device logging.
---
 drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_reg.h |  1 +
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 73cebc6aa650..79853d3fc1ed 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
 	u32 gu_misc_iir;
 
 	if (!intel_irqs_enabled(i915))
-		return IRQ_NONE;
+		goto none;
 
 	master_tile_ctl = dg1_master_intr_disable(regs);
-	if (!master_tile_ctl) {
-		dg1_master_intr_enable(regs);
-		return IRQ_NONE;
+	if (!master_tile_ctl)
+		goto enable_none;
+
+	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
+		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
+			 master_tile_ctl);
+		goto enable_none;
 	}
 
 	/* FIXME: we only support tile 0 for now. */
-	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
-		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
-		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
-	} else {
-		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
-		dg1_master_intr_enable(regs);
-		return IRQ_NONE;
+	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
+	    DG1_MSTR_TILE(0)) {
+		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
+			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
+					     master_tile_ctl)));
+		goto enable_none;
 	}
 
+	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
+	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
+
 	gen11_gt_irq_handler(gt, master_ctl);
 
 	if (master_ctl & GEN11_DISPLAY_IRQ)
@@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
 	pmu_irq_stats(i915, IRQ_HANDLED);
 
 	return IRQ_HANDLED;
+
+enable_none:
+	dg1_master_intr_enable(regs);
+none:
+	return IRQ_NONE;
 }
 
 /* Called from drm generic code, passed 'crtc' which
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index d8579ab9384c..eefa301c6430 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -5774,6 +5774,7 @@
 
 #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
 #define   DG1_MSTR_IRQ			REG_BIT(31)
+#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
 #define   DG1_MSTR_TILE(t)		REG_BIT(t)
 
 #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-24  9:43 ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-24  9:43 UTC (permalink / raw)
  To: Intel-gfx; +Cc: dri-devel

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Catch and log any garbage in the register, including no tiles marked, or
multiple tiles marked.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
---
We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
during glmark and more badness. So I thought let's log all possible
failure modes from here and also use per-device logging.
---
 drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
 drivers/gpu/drm/i915/i915_reg.h |  1 +
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 73cebc6aa650..79853d3fc1ed 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
 	u32 gu_misc_iir;
 
 	if (!intel_irqs_enabled(i915))
-		return IRQ_NONE;
+		goto none;
 
 	master_tile_ctl = dg1_master_intr_disable(regs);
-	if (!master_tile_ctl) {
-		dg1_master_intr_enable(regs);
-		return IRQ_NONE;
+	if (!master_tile_ctl)
+		goto enable_none;
+
+	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
+		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
+			 master_tile_ctl);
+		goto enable_none;
 	}
 
 	/* FIXME: we only support tile 0 for now. */
-	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
-		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
-		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
-	} else {
-		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
-		dg1_master_intr_enable(regs);
-		return IRQ_NONE;
+	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
+	    DG1_MSTR_TILE(0)) {
+		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
+			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
+					     master_tile_ctl)));
+		goto enable_none;
 	}
 
+	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
+	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
+
 	gen11_gt_irq_handler(gt, master_ctl);
 
 	if (master_ctl & GEN11_DISPLAY_IRQ)
@@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
 	pmu_irq_stats(i915, IRQ_HANDLED);
 
 	return IRQ_HANDLED;
+
+enable_none:
+	dg1_master_intr_enable(regs);
+none:
+	return IRQ_NONE;
 }
 
 /* Called from drm generic code, passed 'crtc' which
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index d8579ab9384c..eefa301c6430 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -5774,6 +5774,7 @@
 
 #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
 #define   DG1_MSTR_IRQ			REG_BIT(31)
+#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
 #define   DG1_MSTR_TILE(t)		REG_BIT(t)
 
 #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✗ Fi.CI.SPARSE: warning for drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-24  9:43 ` [Intel-gfx] " Tvrtko Ursulin
  (?)
@ 2022-05-24 10:01 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2022-05-24 10:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
URL   : https://patchwork.freedesktop.org/series/104305/
State : warning

== Summary ==

Error: dim sparse failed
Sparse version: v0.6.2
Fast mode used, each commit won't be checked separately.



^ permalink raw reply	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✓ Fi.CI.BAT: success for drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-24  9:43 ` [Intel-gfx] " Tvrtko Ursulin
  (?)
  (?)
@ 2022-05-24 10:22 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2022-05-24 10:22 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 14016 bytes --]

== Series Details ==

Series: drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
URL   : https://patchwork.freedesktop.org/series/104305/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_11693 -> Patchwork_104305v1
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/index.html

Participating hosts (44 -> 46)
------------------------------

  Additional (4): fi-tgl-u2 bat-adlm-1 fi-icl-u2 bat-dg1-5 
  Missing    (2): bat-dg2-8 fi-rkl-11600 

Known issues
------------

  Here are the changes found in Patchwork_104305v1 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@fbdev@write:
    - bat-dg1-5:          NOTRUN -> [SKIP][1] ([i915#2582]) +4 similar issues
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@fbdev@write.html

  * igt@gem_huc_copy@huc-copy:
    - fi-tgl-u2:          NOTRUN -> [SKIP][2] ([i915#2190])
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@gem_huc_copy@huc-copy.html
    - fi-icl-u2:          NOTRUN -> [SKIP][3] ([i915#2190])
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@gem_huc_copy@huc-copy.html

  * igt@gem_lmem_swapping@parallel-random-engines:
    - fi-icl-u2:          NOTRUN -> [SKIP][4] ([i915#4613]) +3 similar issues
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@gem_lmem_swapping@parallel-random-engines.html

  * igt@gem_mmap@basic:
    - bat-dg1-5:          NOTRUN -> [SKIP][5] ([i915#4083])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@gem_mmap@basic.html

  * igt@gem_tiled_fence_blits@basic:
    - bat-dg1-5:          NOTRUN -> [SKIP][6] ([i915#4077]) +2 similar issues
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@gem_tiled_fence_blits@basic.html

  * igt@gem_tiled_pread_basic:
    - bat-dg1-5:          NOTRUN -> [SKIP][7] ([i915#4079]) +1 similar issue
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@gem_tiled_pread_basic.html

  * igt@i915_module_load@reload:
    - fi-kbl-soraka:      [PASS][8] -> [DMESG-WARN][9] ([i915#1982])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/fi-kbl-soraka/igt@i915_module_load@reload.html
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-kbl-soraka/igt@i915_module_load@reload.html

  * igt@i915_pm_backlight@basic-brightness:
    - bat-dg1-5:          NOTRUN -> [SKIP][10] ([i915#1155])
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@i915_pm_backlight@basic-brightness.html

  * igt@i915_selftest@live@hangcheck:
    - bat-dg1-6:          [PASS][11] -> [DMESG-FAIL][12] ([i915#4494] / [i915#4957])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/bat-dg1-6/igt@i915_selftest@live@hangcheck.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-6/igt@i915_selftest@live@hangcheck.html

  * igt@i915_suspend@basic-s2idle-without-i915:
    - bat-dg1-5:          NOTRUN -> [INCOMPLETE][13] ([i915#6011])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@i915_suspend@basic-s2idle-without-i915.html

  * igt@i915_suspend@basic-s3-without-i915:
    - fi-icl-u2:          NOTRUN -> [SKIP][14] ([i915#5903])
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@i915_suspend@basic-s3-without-i915.html

  * igt@kms_addfb_basic@basic-y-tiled-legacy:
    - bat-dg1-5:          NOTRUN -> [SKIP][15] ([i915#4215])
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_addfb_basic@basic-y-tiled-legacy.html

  * igt@kms_addfb_basic@tile-pitch-mismatch:
    - bat-dg1-5:          NOTRUN -> [SKIP][16] ([i915#4212]) +7 similar issues
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_addfb_basic@tile-pitch-mismatch.html

  * igt@kms_busy@basic:
    - bat-dg1-5:          NOTRUN -> [SKIP][17] ([i915#4303])
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_busy@basic.html

  * igt@kms_busy@basic@flip:
    - fi-tgl-u2:          NOTRUN -> [DMESG-WARN][18] ([i915#402]) +2 similar issues
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_busy@basic@flip.html

  * igt@kms_chamelium@common-hpd-after-suspend:
    - fi-hsw-4770:        NOTRUN -> [SKIP][19] ([fdo#109271] / [fdo#111827])
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-hsw-4770/igt@kms_chamelium@common-hpd-after-suspend.html
    - fi-blb-e6850:       NOTRUN -> [SKIP][20] ([fdo#109271])
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-blb-e6850/igt@kms_chamelium@common-hpd-after-suspend.html

  * igt@kms_chamelium@dp-hpd-fast:
    - fi-tgl-u2:          NOTRUN -> [SKIP][21] ([fdo#109284] / [fdo#111827]) +7 similar issues
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_chamelium@dp-hpd-fast.html

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-icl-u2:          NOTRUN -> [SKIP][22] ([fdo#111827]) +8 similar issues
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@kms_chamelium@hdmi-hpd-fast.html
    - bat-dg1-5:          NOTRUN -> [SKIP][23] ([fdo#111827]) +7 similar issues
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_chamelium@hdmi-hpd-fast.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic:
    - fi-tgl-u2:          NOTRUN -> [SKIP][24] ([i915#4103]) +1 similar issue
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy:
    - fi-icl-u2:          NOTRUN -> [SKIP][25] ([fdo#109278]) +2 similar issues
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html
    - bat-dg1-5:          NOTRUN -> [SKIP][26] ([i915#4103] / [i915#4213]) +1 similar issue
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html

  * igt@kms_flip@basic-flip-vs-dpms:
    - bat-dg1-5:          NOTRUN -> [SKIP][27] ([i915#4078]) +21 similar issues
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_flip@basic-flip-vs-dpms.html

  * igt@kms_force_connector_basic@force-load-detect:
    - fi-tgl-u2:          NOTRUN -> [SKIP][28] ([fdo#109285])
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_force_connector_basic@force-load-detect.html
    - bat-dg1-5:          NOTRUN -> [SKIP][29] ([fdo#109285])
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_force_connector_basic@force-load-detect.html
    - fi-icl-u2:          NOTRUN -> [SKIP][30] ([fdo#109285])
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@kms_force_connector_basic@force-load-detect.html

  * igt@kms_psr@primary_page_flip:
    - bat-dg1-5:          NOTRUN -> [SKIP][31] ([i915#1072] / [i915#4078]) +3 similar issues
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_psr@primary_page_flip.html
    - fi-tgl-u2:          NOTRUN -> [SKIP][32] ([i915#668]) +3 similar issues
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_psr@primary_page_flip.html

  * igt@kms_setmode@basic-clone-single-crtc:
    - fi-icl-u2:          NOTRUN -> [SKIP][33] ([i915#3555])
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@kms_setmode@basic-clone-single-crtc.html
    - fi-tgl-u2:          NOTRUN -> [SKIP][34] ([i915#3555])
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@kms_setmode@basic-clone-single-crtc.html
    - bat-dg1-5:          NOTRUN -> [SKIP][35] ([i915#3555])
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@kms_setmode@basic-clone-single-crtc.html

  * igt@prime_vgem@basic-fence-mmap:
    - bat-dg1-5:          NOTRUN -> [SKIP][36] ([i915#3708] / [i915#4077]) +1 similar issue
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@prime_vgem@basic-fence-mmap.html

  * igt@prime_vgem@basic-userptr:
    - fi-icl-u2:          NOTRUN -> [SKIP][37] ([i915#3301])
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-icl-u2/igt@prime_vgem@basic-userptr.html
    - fi-tgl-u2:          NOTRUN -> [SKIP][38] ([i915#3301])
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-tgl-u2/igt@prime_vgem@basic-userptr.html
    - bat-dg1-5:          NOTRUN -> [SKIP][39] ([i915#3708] / [i915#4873])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@prime_vgem@basic-userptr.html

  * igt@prime_vgem@basic-write:
    - bat-dg1-5:          NOTRUN -> [SKIP][40] ([i915#3708]) +3 similar issues
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@prime_vgem@basic-write.html

  * igt@runner@aborted:
    - bat-dg1-5:          NOTRUN -> [FAIL][41] ([i915#4312] / [i915#5257])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-dg1-5/igt@runner@aborted.html

  
#### Possible fixes ####

  * igt@i915_selftest@live@hangcheck:
    - fi-hsw-4770:        [INCOMPLETE][42] ([i915#4785]) -> [PASS][43]
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/fi-hsw-4770/igt@i915_selftest@live@hangcheck.html
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-hsw-4770/igt@i915_selftest@live@hangcheck.html

  * igt@i915_selftest@live@requests:
    - fi-blb-e6850:       [DMESG-FAIL][44] ([i915#4528]) -> [PASS][45]
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/fi-blb-e6850/igt@i915_selftest@live@requests.html
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/fi-blb-e6850/igt@i915_selftest@live@requests.html

  * igt@kms_flip@basic-plain-flip@a-edp1:
    - bat-adlp-4:         [DMESG-WARN][46] ([i915#3576]) -> [PASS][47]
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/bat-adlp-4/igt@kms_flip@basic-plain-flip@a-edp1.html
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/bat-adlp-4/igt@kms_flip@basic-plain-flip@a-edp1.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109284]: https://bugs.freedesktop.org/show_bug.cgi?id=109284
  [fdo#109285]: https://bugs.freedesktop.org/show_bug.cgi?id=109285
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
  [i915#1155]: https://gitlab.freedesktop.org/drm/intel/issues/1155
  [i915#1982]: https://gitlab.freedesktop.org/drm/intel/issues/1982
  [i915#2190]: https://gitlab.freedesktop.org/drm/intel/issues/2190
  [i915#2582]: https://gitlab.freedesktop.org/drm/intel/issues/2582
  [i915#3282]: https://gitlab.freedesktop.org/drm/intel/issues/3282
  [i915#3301]: https://gitlab.freedesktop.org/drm/intel/issues/3301
  [i915#3555]: https://gitlab.freedesktop.org/drm/intel/issues/3555
  [i915#3576]: https://gitlab.freedesktop.org/drm/intel/issues/3576
  [i915#3708]: https://gitlab.freedesktop.org/drm/intel/issues/3708
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#4077]: https://gitlab.freedesktop.org/drm/intel/issues/4077
  [i915#4078]: https://gitlab.freedesktop.org/drm/intel/issues/4078
  [i915#4079]: https://gitlab.freedesktop.org/drm/intel/issues/4079
  [i915#4083]: https://gitlab.freedesktop.org/drm/intel/issues/4083
  [i915#4103]: https://gitlab.freedesktop.org/drm/intel/issues/4103
  [i915#4212]: https://gitlab.freedesktop.org/drm/intel/issues/4212
  [i915#4213]: https://gitlab.freedesktop.org/drm/intel/issues/4213
  [i915#4215]: https://gitlab.freedesktop.org/drm/intel/issues/4215
  [i915#4303]: https://gitlab.freedesktop.org/drm/intel/issues/4303
  [i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
  [i915#4494]: https://gitlab.freedesktop.org/drm/intel/issues/4494
  [i915#4528]: https://gitlab.freedesktop.org/drm/intel/issues/4528
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4785]: https://gitlab.freedesktop.org/drm/intel/issues/4785
  [i915#4873]: https://gitlab.freedesktop.org/drm/intel/issues/4873
  [i915#4957]: https://gitlab.freedesktop.org/drm/intel/issues/4957
  [i915#5257]: https://gitlab.freedesktop.org/drm/intel/issues/5257
  [i915#5334]: https://gitlab.freedesktop.org/drm/intel/issues/5334
  [i915#5414]: https://gitlab.freedesktop.org/drm/intel/issues/5414
  [i915#5903]: https://gitlab.freedesktop.org/drm/intel/issues/5903
  [i915#6011]: https://gitlab.freedesktop.org/drm/intel/issues/6011
  [i915#668]: https://gitlab.freedesktop.org/drm/intel/issues/668


Build changes
-------------

  * Linux: CI_DRM_11693 -> Patchwork_104305v1

  CI-20190529: 20190529
  CI_DRM_11693: 14289bc81309b2126f4ba9f339837dacf34ddf9c @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6485: 51663917b40d36086cc1c555ce4f67b22937694d @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_104305v1: 14289bc81309b2126f4ba9f339837dacf34ddf9c @ git://anongit.freedesktop.org/gfx-ci/linux


### Linux commits

ee55d4de59e7 drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/index.html

[-- Attachment #2: Type: text/html, Size: 16408 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [Intel-gfx] ✗ Fi.CI.IGT: failure for drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-24  9:43 ` [Intel-gfx] " Tvrtko Ursulin
                   ` (2 preceding siblings ...)
  (?)
@ 2022-05-24 11:46 ` Patchwork
  -1 siblings, 0 replies; 25+ messages in thread
From: Patchwork @ 2022-05-24 11:46 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

[-- Attachment #1: Type: text/plain, Size: 52900 bytes --]

== Series Details ==

Series: drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
URL   : https://patchwork.freedesktop.org/series/104305/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_11693_full -> Patchwork_104305v1_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_104305v1_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_104305v1_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Participating hosts (12 -> 13)
------------------------------

  Additional (1): shard-rkl 

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_104305v1_full:

### IGT changes ###

#### Possible regressions ####

  * igt@api_intel_allocator@fork-simple-stress-signal:
    - shard-tglb:         [PASS][1] -> [INCOMPLETE][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglb1/igt@api_intel_allocator@fork-simple-stress-signal.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb6/igt@api_intel_allocator@fork-simple-stress-signal.html

  
#### Suppressed ####

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

  * {igt@kms_hdr@bpc-switch@pipe-a-hdmi-a-1}:
    - {shard-dg1}:        NOTRUN -> [SKIP][3]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-15/igt@kms_hdr@bpc-switch@pipe-a-hdmi-a-1.html

  
Known issues
------------

  Here are the changes found in Patchwork_104305v1_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@feature_discovery@display-2x:
    - shard-iclb:         NOTRUN -> [SKIP][4] ([i915#1839])
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@feature_discovery@display-2x.html

  * igt@gem_exec_fair@basic-deadline:
    - shard-glk:          NOTRUN -> [FAIL][5] ([i915#2846])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@gem_exec_fair@basic-deadline.html

  * igt@gem_exec_fair@basic-pace-share@rcs0:
    - shard-apl:          [PASS][6] -> [FAIL][7] ([i915#2842])
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl8/igt@gem_exec_fair@basic-pace-share@rcs0.html
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl3/igt@gem_exec_fair@basic-pace-share@rcs0.html

  * igt@gem_exec_fair@basic-pace-solo@rcs0:
    - shard-tglb:         NOTRUN -> [FAIL][8] ([i915#2842])
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@gem_exec_fair@basic-pace-solo@rcs0.html

  * igt@gem_exec_fair@basic-pace@vecs0:
    - shard-kbl:          [PASS][9] -> [FAIL][10] ([i915#2842]) +3 similar issues
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl6/igt@gem_exec_fair@basic-pace@vecs0.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl4/igt@gem_exec_fair@basic-pace@vecs0.html

  * igt@gem_exec_flush@basic-batch-kernel-default-uc:
    - shard-snb:          [PASS][11] -> [SKIP][12] ([fdo#109271]) +4 similar issues
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-snb2/igt@gem_exec_flush@basic-batch-kernel-default-uc.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-snb6/igt@gem_exec_flush@basic-batch-kernel-default-uc.html

  * igt@gem_lmem_swapping@basic:
    - shard-tglb:         NOTRUN -> [SKIP][13] ([i915#4613])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@gem_lmem_swapping@basic.html

  * igt@gem_lmem_swapping@random:
    - shard-apl:          NOTRUN -> [SKIP][14] ([fdo#109271] / [i915#4613]) +1 similar issue
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl1/igt@gem_lmem_swapping@random.html

  * igt@gem_lmem_swapping@verify-random-ccs:
    - shard-skl:          NOTRUN -> [SKIP][15] ([fdo#109271] / [i915#4613]) +3 similar issues
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl2/igt@gem_lmem_swapping@verify-random-ccs.html

  * igt@gem_pxp@create-regular-context-1:
    - shard-iclb:         NOTRUN -> [SKIP][16] ([i915#4270]) +1 similar issue
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@gem_pxp@create-regular-context-1.html

  * igt@gem_pxp@verify-pxp-stale-buf-optout-execution:
    - shard-tglb:         NOTRUN -> [SKIP][17] ([i915#4270])
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@gem_pxp@verify-pxp-stale-buf-optout-execution.html

  * igt@gem_render_copy@yf-tiled-to-vebox-yf-tiled:
    - shard-iclb:         NOTRUN -> [SKIP][18] ([i915#768])
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@gem_render_copy@yf-tiled-to-vebox-yf-tiled.html

  * igt@gem_userptr_blits@input-checking:
    - shard-skl:          NOTRUN -> [DMESG-WARN][19] ([i915#4991]) +1 similar issue
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl4/igt@gem_userptr_blits@input-checking.html

  * igt@gen7_exec_parse@basic-rejected:
    - shard-iclb:         NOTRUN -> [SKIP][20] ([fdo#109289])
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@gen7_exec_parse@basic-rejected.html

  * igt@gen9_exec_parse@unaligned-jump:
    - shard-iclb:         NOTRUN -> [SKIP][21] ([i915#2856])
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@gen9_exec_parse@unaligned-jump.html

  * igt@i915_pm_dc@dc3co-vpb-simulation:
    - shard-iclb:         NOTRUN -> [SKIP][22] ([i915#658])
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@i915_pm_dc@dc3co-vpb-simulation.html

  * igt@i915_pm_dc@dc6-dpms:
    - shard-skl:          NOTRUN -> [FAIL][23] ([i915#454])
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl10/igt@i915_pm_dc@dc6-dpms.html

  * igt@i915_pm_dc@dc6-psr:
    - shard-iclb:         [PASS][24] -> [FAIL][25] ([i915#454])
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb5/igt@i915_pm_dc@dc6-psr.html
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb3/igt@i915_pm_dc@dc6-psr.html

  * igt@kms_atomic_transition@plane-all-modeset-transition:
    - shard-iclb:         NOTRUN -> [SKIP][26] ([i915#1769])
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_atomic_transition@plane-all-modeset-transition.html

  * igt@kms_big_fb@4-tiled-32bpp-rotate-90:
    - shard-iclb:         NOTRUN -> [SKIP][27] ([i915#5286])
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_big_fb@4-tiled-32bpp-rotate-90.html

  * igt@kms_big_fb@4-tiled-8bpp-rotate-270:
    - shard-tglb:         NOTRUN -> [SKIP][28] ([i915#5286])
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_big_fb@4-tiled-8bpp-rotate-270.html

  * igt@kms_big_fb@linear-32bpp-rotate-90:
    - shard-iclb:         NOTRUN -> [SKIP][29] ([fdo#110725] / [fdo#111614]) +1 similar issue
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_big_fb@linear-32bpp-rotate-90.html

  * igt@kms_big_fb@x-tiled-64bpp-rotate-270:
    - shard-tglb:         NOTRUN -> [SKIP][30] ([fdo#111614])
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_big_fb@x-tiled-64bpp-rotate-270.html

  * igt@kms_big_fb@x-tiled-max-hw-stride-64bpp-rotate-180-async-flip:
    - shard-skl:          NOTRUN -> [FAIL][31] ([i915#3743]) +1 similar issue
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl2/igt@kms_big_fb@x-tiled-max-hw-stride-64bpp-rotate-180-async-flip.html

  * igt@kms_big_fb@y-tiled-max-hw-stride-64bpp-rotate-180-async-flip:
    - shard-skl:          NOTRUN -> [FAIL][32] ([i915#3763])
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl2/igt@kms_big_fb@y-tiled-max-hw-stride-64bpp-rotate-180-async-flip.html

  * igt@kms_big_fb@yf-tiled-8bpp-rotate-90:
    - shard-iclb:         NOTRUN -> [SKIP][33] ([fdo#110723])
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_big_fb@yf-tiled-8bpp-rotate-90.html

  * igt@kms_ccs@pipe-a-bad-aux-stride-y_tiled_gen12_rc_ccs_cc:
    - shard-iclb:         NOTRUN -> [SKIP][34] ([fdo#109278] / [i915#3886]) +2 similar issues
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_ccs@pipe-a-bad-aux-stride-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_ccs@pipe-b-bad-pixel-format-y_tiled_gen12_mc_ccs:
    - shard-kbl:          NOTRUN -> [SKIP][35] ([fdo#109271] / [i915#3886])
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl6/igt@kms_ccs@pipe-b-bad-pixel-format-y_tiled_gen12_mc_ccs.html

  * igt@kms_ccs@pipe-c-crc-primary-rotation-180-y_tiled_gen12_rc_ccs_cc:
    - shard-apl:          NOTRUN -> [SKIP][36] ([fdo#109271] / [i915#3886]) +1 similar issue
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl2/igt@kms_ccs@pipe-c-crc-primary-rotation-180-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_ccs@pipe-c-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs:
    - shard-skl:          NOTRUN -> [SKIP][37] ([fdo#109271] / [i915#3886]) +7 similar issues
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl10/igt@kms_ccs@pipe-c-crc-sprite-planes-basic-y_tiled_gen12_mc_ccs.html

  * igt@kms_ccs@pipe-c-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc:
    - shard-glk:          NOTRUN -> [SKIP][38] ([fdo#109271] / [i915#3886])
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@kms_ccs@pipe-c-missing-ccs-buffer-y_tiled_gen12_rc_ccs_cc.html

  * igt@kms_ccs@pipe-c-random-ccs-data-y_tiled_gen12_mc_ccs:
    - shard-tglb:         NOTRUN -> [SKIP][39] ([i915#3689] / [i915#3886])
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_ccs@pipe-c-random-ccs-data-y_tiled_gen12_mc_ccs.html

  * igt@kms_ccs@pipe-d-bad-pixel-format-y_tiled_gen12_rc_ccs:
    - shard-kbl:          NOTRUN -> [SKIP][40] ([fdo#109271]) +7 similar issues
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl6/igt@kms_ccs@pipe-d-bad-pixel-format-y_tiled_gen12_rc_ccs.html

  * igt@kms_ccs@pipe-d-crc-primary-basic-y_tiled_ccs:
    - shard-tglb:         NOTRUN -> [SKIP][41] ([i915#3689])
   [41]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_ccs@pipe-d-crc-primary-basic-y_tiled_ccs.html

  * igt@kms_ccs@pipe-d-random-ccs-data-yf_tiled_ccs:
    - shard-tglb:         NOTRUN -> [SKIP][42] ([fdo#111615] / [i915#3689])
   [42]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_ccs@pipe-d-random-ccs-data-yf_tiled_ccs.html

  * igt@kms_chamelium@hdmi-audio:
    - shard-apl:          NOTRUN -> [SKIP][43] ([fdo#109271] / [fdo#111827]) +6 similar issues
   [43]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl2/igt@kms_chamelium@hdmi-audio.html

  * igt@kms_chamelium@vga-hpd-for-each-pipe:
    - shard-kbl:          NOTRUN -> [SKIP][44] ([fdo#109271] / [fdo#111827])
   [44]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl6/igt@kms_chamelium@vga-hpd-for-each-pipe.html

  * igt@kms_color@pipe-c-deep-color:
    - shard-tglb:         NOTRUN -> [SKIP][45] ([i915#3555])
   [45]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_color@pipe-c-deep-color.html

  * igt@kms_color_chamelium@pipe-a-ctm-negative:
    - shard-tglb:         NOTRUN -> [SKIP][46] ([fdo#109284] / [fdo#111827])
   [46]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_color_chamelium@pipe-a-ctm-negative.html

  * igt@kms_color_chamelium@pipe-b-ctm-max:
    - shard-skl:          NOTRUN -> [SKIP][47] ([fdo#109271] / [fdo#111827]) +15 similar issues
   [47]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@kms_color_chamelium@pipe-b-ctm-max.html

  * igt@kms_color_chamelium@pipe-c-gamma:
    - shard-iclb:         NOTRUN -> [SKIP][48] ([fdo#109284] / [fdo#111827])
   [48]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_color_chamelium@pipe-c-gamma.html

  * igt@kms_color_chamelium@pipe-d-degamma:
    - shard-glk:          NOTRUN -> [SKIP][49] ([fdo#109271] / [fdo#111827]) +1 similar issue
   [49]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@kms_color_chamelium@pipe-d-degamma.html

  * igt@kms_content_protection@atomic:
    - shard-iclb:         NOTRUN -> [SKIP][50] ([fdo#109300] / [fdo#111066])
   [50]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_content_protection@atomic.html

  * igt@kms_cursor_crc@pipe-a-cursor-32x10-sliding:
    - shard-tglb:         NOTRUN -> [SKIP][51] ([i915#3359])
   [51]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_cursor_crc@pipe-a-cursor-32x10-sliding.html

  * igt@kms_cursor_crc@pipe-a-cursor-suspend:
    - shard-kbl:          [PASS][52] -> [DMESG-WARN][53] ([i915#180]) +6 similar issues
   [52]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl3/igt@kms_cursor_crc@pipe-a-cursor-suspend.html
   [53]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@kms_cursor_crc@pipe-a-cursor-suspend.html

  * igt@kms_cursor_crc@pipe-d-cursor-512x512-sliding:
    - shard-tglb:         NOTRUN -> [SKIP][54] ([fdo#109279] / [i915#3359]) +1 similar issue
   [54]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_cursor_crc@pipe-d-cursor-512x512-sliding.html

  * igt@kms_cursor_legacy@2x-cursor-vs-flip-legacy:
    - shard-iclb:         NOTRUN -> [SKIP][55] ([fdo#109274] / [fdo#109278])
   [55]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_cursor_legacy@2x-cursor-vs-flip-legacy.html

  * igt@kms_cursor_legacy@cursorb-vs-flipb-atomic-transitions:
    - shard-tglb:         NOTRUN -> [SKIP][56] ([fdo#109274] / [fdo#111825])
   [56]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_cursor_legacy@cursorb-vs-flipb-atomic-transitions.html

  * igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions:
    - shard-glk:          [PASS][57] -> [FAIL][58] ([i915#2346])
   [57]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-glk5/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html
   [58]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk5/igt@kms_cursor_legacy@flip-vs-cursor-atomic-transitions.html

  * igt@kms_cursor_legacy@flip-vs-cursor-varying-size:
    - shard-iclb:         [PASS][59] -> [FAIL][60] ([i915#2346]) +1 similar issue
   [59]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb5/igt@kms_cursor_legacy@flip-vs-cursor-varying-size.html
   [60]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_cursor_legacy@flip-vs-cursor-varying-size.html

  * igt@kms_draw_crc@draw-method-xrgb2101010-pwrite-4tiled:
    - shard-iclb:         NOTRUN -> [SKIP][61] ([i915#5287])
   [61]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_draw_crc@draw-method-xrgb2101010-pwrite-4tiled.html

  * igt@kms_draw_crc@draw-method-xrgb8888-mmap-wc-4tiled:
    - shard-tglb:         NOTRUN -> [SKIP][62] ([i915#5287])
   [62]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_draw_crc@draw-method-xrgb8888-mmap-wc-4tiled.html

  * igt@kms_fbcon_fbt@psr-suspend:
    - shard-skl:          NOTRUN -> [FAIL][63] ([i915#4767])
   [63]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl1/igt@kms_fbcon_fbt@psr-suspend.html

  * igt@kms_flip@2x-absolute-wf_vblank:
    - shard-tglb:         NOTRUN -> [SKIP][64] ([fdo#109274] / [fdo#111825] / [i915#3966])
   [64]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_flip@2x-absolute-wf_vblank.html

  * igt@kms_flip@2x-flip-vs-dpms-off-vs-modeset:
    - shard-iclb:         NOTRUN -> [SKIP][65] ([fdo#109274])
   [65]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_flip@2x-flip-vs-dpms-off-vs-modeset.html

  * igt@kms_flip_scaled_crc@flip-32bpp-ytile-to-32bpp-ytilegen12rcccs-downscaling:
    - shard-skl:          NOTRUN -> [SKIP][66] ([fdo#109271] / [i915#3701]) +1 similar issue
   [66]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@kms_flip_scaled_crc@flip-32bpp-ytile-to-32bpp-ytilegen12rcccs-downscaling.html

  * igt@kms_frontbuffer_tracking@fbc-2p-primscrn-pri-shrfb-draw-mmap-wc:
    - shard-tglb:         NOTRUN -> [SKIP][67] ([fdo#109280] / [fdo#111825]) +3 similar issues
   [67]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_frontbuffer_tracking@fbc-2p-primscrn-pri-shrfb-draw-mmap-wc.html

  * igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-spr-indfb-draw-mmap-wc:
    - shard-apl:          NOTRUN -> [SKIP][68] ([fdo#109271]) +56 similar issues
   [68]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl2/igt@kms_frontbuffer_tracking@fbc-2p-scndscrn-spr-indfb-draw-mmap-wc.html

  * igt@kms_frontbuffer_tracking@fbcpsr-2p-shrfb-fliptrack-mmap-gtt:
    - shard-glk:          NOTRUN -> [SKIP][69] ([fdo#109271]) +24 similar issues
   [69]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@kms_frontbuffer_tracking@fbcpsr-2p-shrfb-fliptrack-mmap-gtt.html

  * igt@kms_frontbuffer_tracking@psr-2p-primscrn-pri-shrfb-draw-mmap-wc:
    - shard-iclb:         NOTRUN -> [SKIP][70] ([fdo#109280]) +4 similar issues
   [70]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_frontbuffer_tracking@psr-2p-primscrn-pri-shrfb-draw-mmap-wc.html

  * igt@kms_hdr@static-swap:
    - shard-iclb:         NOTRUN -> [SKIP][71] ([i915#3555])
   [71]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_hdr@static-swap.html

  * igt@kms_pipe_crc_basic@read-crc-pipe-d:
    - shard-apl:          NOTRUN -> [SKIP][72] ([fdo#109271] / [i915#533])
   [72]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl2/igt@kms_pipe_crc_basic@read-crc-pipe-d.html

  * igt@kms_plane_alpha_blend@pipe-a-alpha-opaque-fb:
    - shard-glk:          NOTRUN -> [FAIL][73] ([fdo#108145] / [i915#265])
   [73]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@kms_plane_alpha_blend@pipe-a-alpha-opaque-fb.html

  * igt@kms_plane_alpha_blend@pipe-a-constant-alpha-max:
    - shard-skl:          NOTRUN -> [FAIL][74] ([fdo#108145] / [i915#265]) +2 similar issues
   [74]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl10/igt@kms_plane_alpha_blend@pipe-a-constant-alpha-max.html

  * igt@kms_plane_alpha_blend@pipe-c-coverage-7efc:
    - shard-skl:          [PASS][75] -> [FAIL][76] ([fdo#108145] / [i915#265])
   [75]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl10/igt@kms_plane_alpha_blend@pipe-c-coverage-7efc.html
   [76]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl1/igt@kms_plane_alpha_blend@pipe-c-coverage-7efc.html

  * igt@kms_plane_cursor@pipe-d-primary-size-64:
    - shard-iclb:         NOTRUN -> [SKIP][77] ([fdo#109278]) +9 similar issues
   [77]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_plane_cursor@pipe-d-primary-size-64.html

  * igt@kms_plane_lowres@pipe-a-tiling-none:
    - shard-tglb:         NOTRUN -> [SKIP][78] ([i915#3536])
   [78]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_plane_lowres@pipe-a-tiling-none.html

  * igt@kms_plane_lowres@pipe-c-tiling-4:
    - shard-tglb:         NOTRUN -> [SKIP][79] ([i915#5288])
   [79]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@kms_plane_lowres@pipe-c-tiling-4.html

  * igt@kms_plane_scaling@planes-upscale-factor-0-25-downscale-factor-0-25@pipe-a-edp-1-planes-upscale-downscale:
    - shard-skl:          NOTRUN -> [SKIP][80] ([fdo#109271]) +259 similar issues
   [80]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl10/igt@kms_plane_scaling@planes-upscale-factor-0-25-downscale-factor-0-25@pipe-a-edp-1-planes-upscale-downscale.html

  * igt@kms_psr2_sf@overlay-plane-move-continuous-exceed-fully-sf:
    - shard-skl:          NOTRUN -> [SKIP][81] ([fdo#109271] / [i915#658]) +2 similar issues
   [81]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@kms_psr2_sf@overlay-plane-move-continuous-exceed-fully-sf.html

  * igt@kms_psr2_su@frontbuffer-xrgb8888:
    - shard-iclb:         NOTRUN -> [SKIP][82] ([fdo#109642] / [fdo#111068] / [i915#658])
   [82]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@kms_psr2_su@frontbuffer-xrgb8888.html

  * igt@kms_psr2_su@page_flip-xrgb8888:
    - shard-glk:          NOTRUN -> [SKIP][83] ([fdo#109271] / [i915#658]) +1 similar issue
   [83]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@kms_psr2_su@page_flip-xrgb8888.html

  * igt@kms_psr@psr2_sprite_plane_move:
    - shard-iclb:         [PASS][84] -> [SKIP][85] ([fdo#109441]) +1 similar issue
   [84]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb2/igt@kms_psr@psr2_sprite_plane_move.html
   [85]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb3/igt@kms_psr@psr2_sprite_plane_move.html

  * igt@kms_psr_stress_test@flip-primary-invalidate-overlay:
    - shard-tglb:         [PASS][86] -> [SKIP][87] ([i915#5519])
   [86]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglb3/igt@kms_psr_stress_test@flip-primary-invalidate-overlay.html
   [87]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb7/igt@kms_psr_stress_test@flip-primary-invalidate-overlay.html

  * igt@kms_writeback@writeback-pixel-formats:
    - shard-skl:          NOTRUN -> [SKIP][88] ([fdo#109271] / [i915#2437])
   [88]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@kms_writeback@writeback-pixel-formats.html

  * igt@nouveau_crc@ctx-flip-threshold-reset-after-capture:
    - shard-tglb:         NOTRUN -> [SKIP][89] ([i915#2530])
   [89]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@nouveau_crc@ctx-flip-threshold-reset-after-capture.html

  * igt@nouveau_crc@pipe-c-ctx-flip-detection:
    - shard-iclb:         NOTRUN -> [SKIP][90] ([i915#2530])
   [90]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@nouveau_crc@pipe-c-ctx-flip-detection.html

  * igt@prime_nv_api@i915_self_import_to_different_fd:
    - shard-tglb:         NOTRUN -> [SKIP][91] ([fdo#109291])
   [91]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@prime_nv_api@i915_self_import_to_different_fd.html

  * igt@prime_nv_pcopy@test2:
    - shard-iclb:         NOTRUN -> [SKIP][92] ([fdo#109291])
   [92]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@prime_nv_pcopy@test2.html

  * igt@prime_vgem@fence-flip-hang:
    - shard-iclb:         NOTRUN -> [SKIP][93] ([fdo#109295])
   [93]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@prime_vgem@fence-flip-hang.html

  * igt@sysfs_clients@fair-3:
    - shard-glk:          NOTRUN -> [SKIP][94] ([fdo#109271] / [i915#2994])
   [94]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@sysfs_clients@fair-3.html

  * igt@sysfs_clients@fair-7:
    - shard-apl:          NOTRUN -> [SKIP][95] ([fdo#109271] / [i915#2994])
   [95]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl2/igt@sysfs_clients@fair-7.html
    - shard-skl:          NOTRUN -> [SKIP][96] ([fdo#109271] / [i915#2994])
   [96]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@sysfs_clients@fair-7.html

  * igt@sysfs_clients@pidname:
    - shard-tglb:         NOTRUN -> [SKIP][97] ([i915#2994])
   [97]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb5/igt@sysfs_clients@pidname.html

  
#### Possible fixes ####

  * igt@gem_eio@in-flight-contexts-10ms:
    - shard-iclb:         [TIMEOUT][98] ([i915#3070]) -> [PASS][99]
   [98]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb3/igt@gem_eio@in-flight-contexts-10ms.html
   [99]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb1/igt@gem_eio@in-flight-contexts-10ms.html

  * igt@gem_eio@in-flight-immediate:
    - shard-tglb:         [TIMEOUT][100] ([i915#3063]) -> [PASS][101]
   [100]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglb1/igt@gem_eio@in-flight-immediate.html
   [101]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb2/igt@gem_eio@in-flight-immediate.html

  * igt@gem_eio@kms:
    - shard-tglb:         [FAIL][102] ([i915#5784]) -> [PASS][103]
   [102]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglb5/igt@gem_eio@kms.html
   [103]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb7/igt@gem_eio@kms.html

  * igt@gem_eio@reset-stress:
    - shard-apl:          [FAIL][104] -> [PASS][105]
   [104]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl2/igt@gem_eio@reset-stress.html
   [105]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl1/igt@gem_eio@reset-stress.html

  * igt@gem_eio@unwedge-stress:
    - {shard-tglu}:       [TIMEOUT][106] ([i915#3063]) -> [PASS][107]
   [106]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglu-3/igt@gem_eio@unwedge-stress.html
   [107]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglu-6/igt@gem_eio@unwedge-stress.html

  * igt@gem_exec_balancer@parallel-balancer:
    - shard-iclb:         [SKIP][108] ([i915#4525]) -> [PASS][109]
   [108]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb6/igt@gem_exec_balancer@parallel-balancer.html
   [109]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb1/igt@gem_exec_balancer@parallel-balancer.html

  * igt@gem_exec_fair@basic-throttle@rcs0:
    - shard-iclb:         [FAIL][110] ([i915#2849]) -> [PASS][111]
   [110]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb6/igt@gem_exec_fair@basic-throttle@rcs0.html
   [111]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb1/igt@gem_exec_fair@basic-throttle@rcs0.html

  * igt@gem_exec_flush@basic-batch-kernel-default-wb:
    - shard-snb:          [SKIP][112] ([fdo#109271]) -> [PASS][113]
   [112]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-snb6/igt@gem_exec_flush@basic-batch-kernel-default-wb.html
   [113]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-snb2/igt@gem_exec_flush@basic-batch-kernel-default-wb.html

  * igt@gen9_exec_parse@allowed-single:
    - shard-glk:          [DMESG-WARN][114] ([i915#5566] / [i915#716]) -> [PASS][115]
   [114]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-glk3/igt@gen9_exec_parse@allowed-single.html
   [115]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk6/igt@gen9_exec_parse@allowed-single.html

  * igt@i915_pm_rpm@system-suspend:
    - shard-skl:          [INCOMPLETE][116] ([i915#5420]) -> [PASS][117]
   [116]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl8/igt@i915_pm_rpm@system-suspend.html
   [117]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl9/igt@i915_pm_rpm@system-suspend.html

  * igt@i915_pm_rps@basic-api:
    - {shard-dg1}:        [FAIL][118] ([i915#4032]) -> [PASS][119]
   [118]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-dg1-12/igt@i915_pm_rps@basic-api.html
   [119]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-16/igt@i915_pm_rps@basic-api.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@a-edp1:
    - shard-skl:          [FAIL][120] ([i915#79]) -> [PASS][121]
   [120]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl4/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-edp1.html
   [121]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl6/igt@kms_flip@flip-vs-expired-vblank-interruptible@a-edp1.html

  * igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1:
    - shard-glk:          [FAIL][122] ([i915#79]) -> [PASS][123]
   [122]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-glk8/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1.html
   [123]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-glk9/igt@kms_flip@flip-vs-expired-vblank-interruptible@b-hdmi-a1.html

  * igt@kms_flip@flip-vs-suspend-interruptible@c-dp1:
    - shard-apl:          [DMESG-WARN][124] ([i915#180]) -> [PASS][125] +1 similar issue
   [124]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl4/igt@kms_flip@flip-vs-suspend-interruptible@c-dp1.html
   [125]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl1/igt@kms_flip@flip-vs-suspend-interruptible@c-dp1.html

  * igt@kms_flip@flip-vs-suspend@c-dp1:
    - shard-kbl:          [DMESG-WARN][126] ([i915#180]) -> [PASS][127] +2 similar issues
   [126]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl7/igt@kms_flip@flip-vs-suspend@c-dp1.html
   [127]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl6/igt@kms_flip@flip-vs-suspend@c-dp1.html

  * igt@kms_flip@plain-flip-fb-recreate-interruptible@a-edp1:
    - shard-skl:          [FAIL][128] ([i915#2122]) -> [PASS][129] +1 similar issue
   [128]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl9/igt@kms_flip@plain-flip-fb-recreate-interruptible@a-edp1.html
   [129]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl8/igt@kms_flip@plain-flip-fb-recreate-interruptible@a-edp1.html

  * igt@kms_frontbuffer_tracking@fbc-rgb101010-draw-pwrite:
    - {shard-dg1}:        [SKIP][130] ([i915#5721]) -> [PASS][131] +1 similar issue
   [130]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-dg1-13/igt@kms_frontbuffer_tracking@fbc-rgb101010-draw-pwrite.html
   [131]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-12/igt@kms_frontbuffer_tracking@fbc-rgb101010-draw-pwrite.html

  * igt@kms_plane_alpha_blend@pipe-b-coverage-7efc:
    - shard-skl:          [FAIL][132] ([fdo#108145] / [i915#265]) -> [PASS][133] +1 similar issue
   [132]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl9/igt@kms_plane_alpha_blend@pipe-b-coverage-7efc.html
   [133]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl2/igt@kms_plane_alpha_blend@pipe-b-coverage-7efc.html

  * igt@kms_plane_scaling@planes-unity-scaling-downscale-factor-0-5@pipe-a-edp-1-planes-upscale-downscale:
    - shard-iclb:         [SKIP][134] ([i915#5235]) -> [PASS][135] +2 similar issues
   [134]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb2/igt@kms_plane_scaling@planes-unity-scaling-downscale-factor-0-5@pipe-a-edp-1-planes-upscale-downscale.html
   [135]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb3/igt@kms_plane_scaling@planes-unity-scaling-downscale-factor-0-5@pipe-a-edp-1-planes-upscale-downscale.html

  * igt@kms_psr@psr2_primary_mmap_cpu:
    - shard-iclb:         [SKIP][136] ([fdo#109441]) -> [PASS][137]
   [136]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb3/igt@kms_psr@psr2_primary_mmap_cpu.html
   [137]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb2/igt@kms_psr@psr2_primary_mmap_cpu.html

  * igt@kms_psr_stress_test@invalidate-primary-flip-overlay:
    - shard-iclb:         [SKIP][138] ([i915#5519]) -> [PASS][139]
   [138]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb8/igt@kms_psr_stress_test@invalidate-primary-flip-overlay.html
   [139]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb8/igt@kms_psr_stress_test@invalidate-primary-flip-overlay.html

  * igt@kms_rotation_crc@sprite-rotation-90-pos-100-0:
    - {shard-dg1}:        [SKIP][140] ([i915#1836]) -> [PASS][141] +3 similar issues
   [140]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-dg1-13/igt@kms_rotation_crc@sprite-rotation-90-pos-100-0.html
   [141]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-12/igt@kms_rotation_crc@sprite-rotation-90-pos-100-0.html

  * igt@kms_sequence@get-idle@hdmi-a-1-pipe-c:
    - {shard-dg1}:        [FAIL][142] -> [PASS][143] +4 similar issues
   [142]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-dg1-13/igt@kms_sequence@get-idle@hdmi-a-1-pipe-c.html
   [143]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-12/igt@kms_sequence@get-idle@hdmi-a-1-pipe-c.html

  * igt@kms_vblank@pipe-b-accuracy-idle:
    - {shard-dg1}:        [FAIL][144] ([i915#4241]) -> [PASS][145]
   [144]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-dg1-13/igt@kms_vblank@pipe-b-accuracy-idle.html
   [145]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-dg1-12/igt@kms_vblank@pipe-b-accuracy-idle.html

  
#### Warnings ####

  * igt@gem_eio@unwedge-stress:
    - shard-tglb:         [FAIL][146] ([i915#5784]) -> [TIMEOUT][147] ([i915#3063])
   [146]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-tglb1/igt@gem_eio@unwedge-stress.html
   [147]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-tglb6/igt@gem_eio@unwedge-stress.html

  * igt@gem_exec_balancer@parallel-bb-first:
    - shard-iclb:         [DMESG-WARN][148] ([i915#5614]) -> [SKIP][149] ([i915#4525])
   [148]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb2/igt@gem_exec_balancer@parallel-bb-first.html
   [149]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb7/igt@gem_exec_balancer@parallel-bb-first.html

  * igt@gem_exec_balancer@parallel-keep-in-fence:
    - shard-iclb:         [SKIP][150] ([i915#4525]) -> [DMESG-WARN][151] ([i915#5614])
   [150]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb6/igt@gem_exec_balancer@parallel-keep-in-fence.html
   [151]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb2/igt@gem_exec_balancer@parallel-keep-in-fence.html

  * igt@kms_fbcon_fbt@fbc-suspend:
    - shard-kbl:          [FAIL][152] ([i915#4767]) -> [INCOMPLETE][153] ([i915#180])
   [152]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl1/igt@kms_fbcon_fbt@fbc-suspend.html
   [153]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl7/igt@kms_fbcon_fbt@fbc-suspend.html

  * igt@kms_psr2_sf@cursor-plane-move-continuous-sf:
    - shard-iclb:         [SKIP][154] ([i915#658]) -> [SKIP][155] ([i915#2920])
   [154]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb3/igt@kms_psr2_sf@cursor-plane-move-continuous-sf.html
   [155]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb2/igt@kms_psr2_sf@cursor-plane-move-continuous-sf.html

  * igt@kms_psr2_sf@overlay-plane-move-continuous-exceed-fully-sf:
    - shard-iclb:         [SKIP][156] ([i915#2920]) -> [SKIP][157] ([i915#658])
   [156]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb2/igt@kms_psr2_sf@overlay-plane-move-continuous-exceed-fully-sf.html
   [157]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb3/igt@kms_psr2_sf@overlay-plane-move-continuous-exceed-fully-sf.html

  * igt@kms_psr2_sf@plane-move-sf-dmg-area:
    - shard-iclb:         [SKIP][158] ([fdo#111068] / [i915#658]) -> [SKIP][159] ([i915#2920])
   [158]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-iclb6/igt@kms_psr2_sf@plane-move-sf-dmg-area.html
   [159]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-iclb2/igt@kms_psr2_sf@plane-move-sf-dmg-area.html

  * igt@runner@aborted:
    - shard-kbl:          ([FAIL][160], [FAIL][161], [FAIL][162], [FAIL][163], [FAIL][164], [FAIL][165], [FAIL][166], [FAIL][167], [FAIL][168]) ([i915#180] / [i915#3002] / [i915#4312] / [i915#5257]) -> ([FAIL][169], [FAIL][170], [FAIL][171], [FAIL][172], [FAIL][173], [FAIL][174], [FAIL][175], [FAIL][176], [FAIL][177], [FAIL][178], [FAIL][179]) ([i915#180] / [i915#3002] / [i915#4312] / [i915#5257] / [i915#92])
   [160]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl1/igt@runner@aborted.html
   [161]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl4/igt@runner@aborted.html
   [162]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl6/igt@runner@aborted.html
   [163]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl7/igt@runner@aborted.html
   [164]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl4/igt@runner@aborted.html
   [165]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl1/igt@runner@aborted.html
   [166]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl7/igt@runner@aborted.html
   [167]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl7/igt@runner@aborted.html
   [168]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-kbl1/igt@runner@aborted.html
   [169]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl3/igt@runner@aborted.html
   [170]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@runner@aborted.html
   [171]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl7/igt@runner@aborted.html
   [172]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl6/igt@runner@aborted.html
   [173]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@runner@aborted.html
   [174]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@runner@aborted.html
   [175]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl4/igt@runner@aborted.html
   [176]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl4/igt@runner@aborted.html
   [177]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl3/igt@runner@aborted.html
   [178]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@runner@aborted.html
   [179]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-kbl1/igt@runner@aborted.html
    - shard-apl:          ([FAIL][180], [FAIL][181], [FAIL][182], [FAIL][183], [FAIL][184], [FAIL][185], [FAIL][186]) ([fdo#109271] / [i915#180] / [i915#3002] / [i915#4312] / [i915#5257]) -> ([FAIL][187], [FAIL][188], [FAIL][189], [FAIL][190], [FAIL][191]) ([i915#3002] / [i915#4312] / [i915#5257])
   [180]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl2/igt@runner@aborted.html
   [181]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl3/igt@runner@aborted.html
   [182]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl6/igt@runner@aborted.html
   [183]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl3/igt@runner@aborted.html
   [184]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl7/igt@runner@aborted.html
   [185]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl7/igt@runner@aborted.html
   [186]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-apl4/igt@runner@aborted.html
   [187]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl3/igt@runner@aborted.html
   [188]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl8/igt@runner@aborted.html
   [189]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl8/igt@runner@aborted.html
   [190]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl4/igt@runner@aborted.html
   [191]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-apl1/igt@runner@aborted.html
    - shard-skl:          ([FAIL][192], [FAIL][193], [FAIL][194], [FAIL][195]) ([i915#2029] / [i915#4312] / [i915#5257]) -> ([FAIL][196], [FAIL][197], [FAIL][198], [FAIL][199]) ([i915#3002] / [i915#4312] / [i915#5257])
   [192]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl6/igt@runner@aborted.html
   [193]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl2/igt@runner@aborted.html
   [194]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl8/igt@runner@aborted.html
   [195]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_11693/shard-skl5/igt@runner@aborted.html
   [196]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl7/igt@runner@aborted.html
   [197]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl6/igt@runner@aborted.html
   [198]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl6/igt@runner@aborted.html
   [199]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/shard-skl4/igt@runner@aborted.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103375]: https://bugs.freedesktop.org/show_bug.cgi?id=103375
  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109274]: https://bugs.freedesktop.org/show_bug.cgi?id=109274
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109279]: https://bugs.freedesktop.org/show_bug.cgi?id=109279
  [fdo#109280]: https://bugs.freedesktop.org/show_bug.cgi?id=109280
  [fdo#109283]: https://bugs.freedesktop.org/show_bug.cgi?id=109283
  [fdo#109284]: https://bugs.freedesktop.org/show_bug.cgi?id=109284
  [fdo#109285]: https://bugs.freedesktop.org/show_bug.cgi?id=109285
  [fdo#109289]: https://bugs.freedesktop.org/show_bug.cgi?id=109289
  [fdo#109291]: https://bugs.freedesktop.org/show_bug.cgi?id=109291
  [fdo#109295]: https://bugs.freedesktop.org/show_bug.cgi?id=109295
  [fdo#109300]: https://bugs.freedesktop.org/show_bug.cgi?id=109300
  [fdo#109302]: https://bugs.freedesktop.org/show_bug.cgi?id=109302
  [fdo#109307]: https://bugs.freedesktop.org/show_bug.cgi?id=109307
  [fdo#109308]: https://bugs.freedesktop.org/show_bug.cgi?id=109308
  [fdo#109309]: https://bugs.freedesktop.org/show_bug.cgi?id=109309
  [fdo#109313]: https://bugs.freedesktop.org/show_bug.cgi?id=109313
  [fdo#109314]: https://bugs.freedesktop.org/show_bug.cgi?id=109314
  [fdo#109441]: https://bugs.freedesktop.org/show_bug.cgi?id=109441
  [fdo#109506]: https://bugs.freedesktop.org/show_bug.cgi?id=109506
  [fdo#109642]: https://bugs.freedesktop.org/show_bug.cgi?id=109642
  [fdo#110189]: https://bugs.freedesktop.org/show_bug.cgi?id=110189
  [fdo#110542]: https://bugs.freedesktop.org/show_bug.cgi?id=110542
  [fdo#110723]: https://bugs.freedesktop.org/show_bug.cgi?id=110723
  [fdo#110725]: https://bugs.freedesktop.org/show_bug.cgi?id=110725
  [fdo#111066]: https://bugs.freedesktop.org/show_bug.cgi?id=111066
  [fdo#111068]: https://bugs.freedesktop.org/show_bug.cgi?id=111068
  [fdo#111314]: https://bugs.freedesktop.org/show_bug.cgi?id=111314
  [fdo#111614]: https://bugs.freedesktop.org/show_bug.cgi?id=111614
  [fdo#111615]: https://bugs.freedesktop.org/show_bug.cgi?id=111615
  [fdo#111656]: https://bugs.freedesktop.org/show_bug.cgi?id=111656
  [fdo#111825]: https://bugs.freedesktop.org/show_bug.cgi?id=111825
  [fdo#111827]: https://bugs.freedesktop.org/show_bug.cgi?id=111827
  [fdo#112022]: https://bugs.freedesktop.org/show_bug.cgi?id=112022
  [fdo#112283]: https://bugs.freedesktop.org/show_bug.cgi?id=112283
  [i915#1072]: https://gitlab.freedesktop.org/drm/intel/issues/1072
  [i915#1149]: https://gitlab.freedesktop.org/drm/intel/issues/1149
  [i915#132]: https://gitlab.freedesktop.org/drm/intel/issues/132
  [i915#1397]: https://gitlab.freedesktop.org/drm/intel/issues/1397
  [i915#1722]: https://gitlab.freedesktop.org/drm/intel/issues/1722
  [i915#1769]: https://gitlab.freedesktop.org/drm/intel/issues/1769
  [i915#180]: https://gitlab.freedesktop.org/drm/intel/issues/180
  [i915#1825]: https://gitlab.freedesktop.org/drm/intel/issues/1825
  [i915#1836]: https://gitlab.freedesktop.org/drm/intel/issues/1836
  [i915#1839]: https://gitlab.freedesktop.org/drm/intel/issues/1839
  [i915#1845]: https://gitlab.freedesktop.org/drm/intel/issues/1845
  [i915#1849]: https://gitlab.freedesktop.org/drm/intel/issues/1849
  [i915#2029]: https://gitlab.freedesktop.org/drm/intel/issues/2029
  [i915#2122]: https://gitlab.freedesktop.org/drm/intel/issues/2122
  [i915#2346]: https://gitlab.freedesktop.org/drm/intel/issues/2346
  [i915#2410]: https://gitlab.freedesktop.org/drm/intel/issues/2410
  [i915#2433]: https://gitlab.freedesktop.org/drm/intel/issues/2433
  [i915#2434]: https://gitlab.freedesktop.org/drm/intel/issues/2434
  [i915#2435]: https://gitlab.freedesktop.org/drm/intel/issues/2435
  [i915#2436]: https://gitlab.freedesktop.org/drm/intel/issues/2436
  [i915#2437]: https://gitlab.freedesktop.org/drm/intel/issues/2437
  [i915#2527]: https://gitlab.freedesktop.org/drm/intel/issues/2527
  [i915#2530]: https://gitlab.freedesktop.org/drm/intel/issues/2530
  [i915#2582]: https://gitlab.freedesktop.org/drm/intel/issues/2582
  [i915#265]: https://gitlab.freedesktop.org/drm/intel/issues/265
  [i915#2705]: https://gitlab.freedesktop.org/drm/intel/issues/2705
  [i915#280]: https://gitlab.freedesktop.org/drm/intel/issues/280
  [i915#2842]: https://gitlab.freedesktop.org/drm/intel/issues/2842
  [i915#2846]: https://gitlab.freedesktop.org/drm/intel/issues/2846
  [i915#2849]: https://gitlab.freedesktop.org/drm/intel/issues/2849
  [i915#2856]: https://gitlab.freedesktop.org/drm/intel/issues/2856
  [i915#2920]: https://gitlab.freedesktop.org/drm/intel/issues/2920
  [i915#2994]: https://gitlab.freedesktop.org/drm/intel/issues/2994
  [i915#3002]: https://gitlab.freedesktop.org/drm/intel/issues/3002
  [i915#3012]: https://gitlab.freedesktop.org/drm/intel/issues/3012
  [i915#3063]: https://gitlab.freedesktop.org/drm/intel/issues/3063
  [i915#3070]: https://gitlab.freedesktop.org/drm/intel/issues/3070
  [i915#3116]: https://gitlab.freedesktop.org/drm/intel/issues/3116
  [i915#3281]: https://gitlab.freedesktop.org/drm/intel/issues/3281
  [i915#3282]: https://gitlab.freedesktop.org/drm/intel/issues/3282
  [i915#3291]: https://gitlab.freedesktop.org/drm/intel/issues/3291
  [i915#3297]: https://gitlab.freedesktop.org/drm/intel/issues/3297
  [i915#3301]: https://gitlab.freedesktop.org/drm/intel/issues/3301
  [i915#3318]: https://gitlab.freedesktop.org/drm/intel/issues/3318
  [i915#3319]: https://gitlab.freedesktop.org/drm/intel/issues/3319
  [i915#3323]: https://gitlab.freedesktop.org/drm/intel/issues/3323
  [i915#3359]: https://gitlab.freedesktop.org/drm/intel/issues/3359
  [i915#3376]: https://gitlab.freedesktop.org/drm/intel/issues/3376
  [i915#3458]: https://gitlab.freedesktop.org/drm/intel/issues/3458
  [i915#3536]: https://gitlab.freedesktop.org/drm/intel/issues/3536
  [i915#3555]: https://gitlab.freedesktop.org/drm/intel/issues/3555
  [i915#3558]: https://gitlab.freedesktop.org/drm/intel/issues/3558
  [i915#3637]: https://gitlab.freedesktop.org/drm/intel/issues/3637
  [i915#3638]: https://gitlab.freedesktop.org/drm/intel/issues/3638
  [i915#3639]: https://gitlab.freedesktop.org/drm/intel/issues/3639
  [i915#3689]: https://gitlab.freedesktop.org/drm/intel/issues/3689
  [i915#3701]: https://gitlab.freedesktop.org/drm/intel/issues/3701
  [i915#3708]: https://gitlab.freedesktop.org/drm/intel/issues/3708
  [i915#3734]: https://gitlab.freedesktop.org/drm/intel/issues/3734
  [i915#3742]: https://gitlab.freedesktop.org/drm/intel/issues/3742
  [i915#3743]: https://gitlab.freedesktop.org/drm/intel/issues/3743
  [i915#3763]: https://gitlab.freedesktop.org/drm/intel/issues/3763
  [i915#3826]: https://gitlab.freedesktop.org/drm/intel/issues/3826
  [i915#3840]: https://gitlab.freedesktop.org/drm/intel/issues/3840
  [i915#3886]: https://gitlab.freedesktop.org/drm/intel/issues/3886
  [i915#3936]: https://gitlab.freedesktop.org/drm/intel/issues/3936
  [i915#3952]: https://gitlab.freedesktop.org/drm/intel/issues/3952
  [i915#3955]: https://gitlab.freedesktop.org/drm/intel/issues/3955
  [i915#3966]: https://gitlab.freedesktop.org/drm/intel/issues/3966
  [i915#4016]: https://gitlab.freedesktop.org/drm/intel/issues/4016
  [i915#4032]: https://gitlab.freedesktop.org/drm/intel/issues/4032
  [i915#4070]: https://gitlab.freedesktop.org/drm/intel/issues/4070
  [i915#4077]: https://gitlab.freedesktop.org/drm/intel/issues/4077
  [i915#4078]: https://gitlab.freedesktop.org/drm/intel/issues/4078
  [i915#4083]: https://gitlab.freedesktop.org/drm/intel/issues/4083
  [i915#4098]: https://gitlab.freedesktop.org/drm/intel/issues/4098
  [i915#4103]: https://gitlab.freedesktop.org/drm/intel/issues/4103
  [i915#4212]: https://gitlab.freedesktop.org/drm/intel/issues/4212
  [i915#4241]: https://gitlab.freedesktop.org/drm/intel/issues/4241
  [i915#4270]: https://gitlab.freedesktop.org/drm/intel/issues/4270
  [i915#4278]: https://gitlab.freedesktop.org/drm/intel/issues/4278
  [i915#4312]: https://gitlab.freedesktop.org/drm/intel/issues/4312
  [i915#433]: https://gitlab.freedesktop.org/drm/intel/issues/433
  [i915#4369]: https://gitlab.freedesktop.org/drm/intel/issues/4369
  [i915#4387]: https://gitlab.freedesktop.org/drm/intel/issues/4387
  [i915#4525]: https://gitlab.freedesktop.org/drm/intel/issues/4525
  [i915#4538]: https://gitlab.freedesktop.org/drm/intel/issues/4538
  [i915#454]: https://gitlab.freedesktop.org/drm/intel/issues/454
  [i915#4613]: https://gitlab.freedesktop.org/drm/intel/issues/4613
  [i915#4767]: https://gitlab.freedesktop.org/drm/intel/issues/4767
  [i915#4833]: https://gitlab.freedesktop.org/drm/intel/issues/4833
  [i915#4842]: https://gitlab.freedesktop.org/drm/intel/issues/4842
  [i915#4853]: https://gitlab.freedesktop.org/drm/intel/issues/4853
  [i915#4860]: https://gitlab.freedesktop.org/drm/intel/issues/4860
  [i915#4880]: https://gitlab.freedesktop.org/drm/intel/issues/4880
  [i915#4883]: https://gitlab.freedesktop.org/drm/intel/issues/4883
  [i915#4991]: https://gitlab.freedesktop.org/drm/intel/issues/4991
  [i915#5098]: https://gitlab.freedesktop.org/drm/intel/issues/5098
  [i915#5176]: https://gitlab.freedesktop.org/drm/intel/issues/5176
  [i915#5182]: https://gitlab.freedesktop.org/drm/intel/issues/5182
  [i915#5235]: https://gitlab.freedesktop.org/drm/intel/issues/5235
  [i915#5257]: https://gitlab.freedesktop.org/drm/intel/issues/5257
  [i915#5286]: https://gitlab.freedesktop.org/drm/intel/issues/5286
  [i915#5287]: https://gitlab.freedesktop.org/drm/intel/issues/5287
  [i915#5288]: https://gitlab.freedesktop.org/drm/intel/issues/5288
  [i915#5289]: https://gitlab.freedesktop.org/drm/intel/issues/5289
  [i915#5325]: https://gitlab.freedesktop.org/drm/intel/issues/5325
  [i915#533]: https://gitlab.freedesktop.org/drm/intel/issues/533
  [i915#5420]: https://gitlab.freedesktop.org/drm/intel/issues/5420
  [i915#5439]: https://gitlab.freedesktop.org/drm/intel/issues/5439
  [i915#5461]: https://gitlab.freedesktop.org/drm/intel/issues/5461
  [i915#5519]: https://gitlab.freedesktop.org/drm/intel/issues/5519
  [i915#5566]: https://gitlab.freedesktop.org/drm/intel/issues/5566
  [i915#5614]: https://gitlab.freedesktop.org/drm/intel/issues/5614
  [i915#5721]: https://gitlab.freedesktop.org/drm/intel/issues/5721
  [i915#5784]: https://gitlab.freedesktop.org/drm/intel/issues/5784
  [i915#5903]: https://gitlab.freedesktop.org/drm/intel/issues/5903
  [i915#658]: https://gitlab.freedesktop.org/drm/intel/issues/658
  [i915#716]: https://gitlab.freedesktop.org/drm/intel/issues/716
  [i915#768]: https://gitlab.freedesktop.org/drm/intel/issues/768
  [i915#79]: https://gitlab.freedesktop.org/drm/intel/issues/79
  [i915#92]: https://gitlab.freedesktop.org/drm/intel/issues/92


Build changes
-------------

  * Linux: CI_DRM_11693 -> Patchwork_104305v1

  CI-20190529: 20190529
  CI_DRM_11693: 14289bc81309b2126f4ba9f339837dacf34ddf9c @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_6485: 51663917b40d36086cc1c555ce4f67b22937694d @ https://gitlab.freedesktop.org/drm/igt-gpu-tools.git
  Patchwork_104305v1: 14289bc81309b2126f4ba9f339837dacf34ddf9c @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_104305v1/index.html

[-- Attachment #2: Type: text/html, Size: 58427 bytes --]

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-24  9:43 ` [Intel-gfx] " Tvrtko Ursulin
@ 2022-05-24 17:51   ` Matt Roper
  -1 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-24 17:51 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin

On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Catch and log any garbage in the register, including no tiles marked, or
> multiple tiles marked.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> ---
> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> during glmark and more badness. So I thought lets log all possible failure
> modes from here and also use per device logging.
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>  2 files changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 73cebc6aa650..79853d3fc1ed 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  	u32 gu_misc_iir;
>  
>  	if (!intel_irqs_enabled(i915))
> -		return IRQ_NONE;
> +		goto none;
>  
>  	master_tile_ctl = dg1_master_intr_disable(regs);
> -	if (!master_tile_ctl) {
> -		dg1_master_intr_enable(regs);
> -		return IRQ_NONE;
> +	if (!master_tile_ctl)
> +		goto enable_none;
> +
> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> +			 master_tile_ctl);

I know we have a bunch of them already, but shouldn't we be avoiding
printk-based stuff like this inside interrupt handlers?  Should we be
migrating all these error messages over to trace_printk or something
similar that's safer to use?


Matt

> +		goto enable_none;
>  	}
>  
>  	/* FIXME: we only support tile 0 for now. */
> -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
> -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> -	} else {
> -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
> -		dg1_master_intr_enable(regs);
> -		return IRQ_NONE;
> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> +	    DG1_MSTR_TILE(0)) {
> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> +					     master_tile_ctl)));
> +		goto enable_none;
>  	}
>  
> +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> +
>  	gen11_gt_irq_handler(gt, master_ctl);
>  
>  	if (master_ctl & GEN11_DISPLAY_IRQ)
> @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  	pmu_irq_stats(i915, IRQ_HANDLED);
>  
>  	return IRQ_HANDLED;
> +
> +enable_none:
> +	dg1_master_intr_enable(regs);
> +none:
> +	return IRQ_NONE;
>  }
>  
>  /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index d8579ab9384c..eefa301c6430 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -5774,6 +5774,7 @@
>  
>  #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
>  #define   DG1_MSTR_IRQ			REG_BIT(31)
> +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
>  #define   DG1_MSTR_TILE(t)		REG_BIT(t)
>  
>  #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
> -- 
> 2.32.0
> 

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-24 17:51   ` Matt Roper
  0 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-24 17:51 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel

On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> Catch and log any garbage in the register, including no tiles marked, or
> multiple tiles marked.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> ---
> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> during glmark and more badness. So I thought lets log all possible failure
> modes from here and also use per device logging.
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>  2 files changed, 23 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 73cebc6aa650..79853d3fc1ed 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  	u32 gu_misc_iir;
>  
>  	if (!intel_irqs_enabled(i915))
> -		return IRQ_NONE;
> +		goto none;
>  
>  	master_tile_ctl = dg1_master_intr_disable(regs);
> -	if (!master_tile_ctl) {
> -		dg1_master_intr_enable(regs);
> -		return IRQ_NONE;
> +	if (!master_tile_ctl)
> +		goto enable_none;
> +
> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> +			 master_tile_ctl);

I know we have a bunch of them already, but shouldn't we be avoiding
printk-based stuff like this inside interrupt handlers?  Should we be
migrating all these error messages over to trace_printk or something
similar that's safer to use?


Matt

> +		goto enable_none;
>  	}
>  
>  	/* FIXME: we only support tile 0 for now. */
> -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
> -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> -	} else {
> -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
> -		dg1_master_intr_enable(regs);
> -		return IRQ_NONE;
> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> +	    DG1_MSTR_TILE(0)) {
> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> +					     master_tile_ctl)));
> +		goto enable_none;
>  	}
>  
> +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> +
>  	gen11_gt_irq_handler(gt, master_ctl);
>  
>  	if (master_ctl & GEN11_DISPLAY_IRQ)
> @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>  	pmu_irq_stats(i915, IRQ_HANDLED);
>  
>  	return IRQ_HANDLED;
> +
> +enable_none:
> +	dg1_master_intr_enable(regs);
> +none:
> +	return IRQ_NONE;
>  }
>  
>  /* Called from drm generic code, passed 'crtc' which
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index d8579ab9384c..eefa301c6430 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -5774,6 +5774,7 @@
>  
>  #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
>  #define   DG1_MSTR_IRQ			REG_BIT(31)
> +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
>  #define   DG1_MSTR_TILE(t)		REG_BIT(t)
>  
>  #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
> -- 
> 2.32.0
> 

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-24 17:51   ` [Intel-gfx] " Matt Roper
@ 2022-05-25 16:03     ` Tvrtko Ursulin
  -1 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-25 16:03 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin


On 24/05/2022 18:51, Matt Roper wrote:
> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Catch and log any garbage in the register, including no tiles marked, or
>> multiple tiles marked.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Matt Roper <matthew.d.roper@intel.com>
>> ---
>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>> during glmark and more badness. So I thought lets log all possible failure
>> modes from here and also use per device logging.
>> ---
>>   drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>   drivers/gpu/drm/i915/i915_reg.h |  1 +
>>   2 files changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 73cebc6aa650..79853d3fc1ed 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>   	u32 gu_misc_iir;
>>   
>>   	if (!intel_irqs_enabled(i915))
>> -		return IRQ_NONE;
>> +		goto none;
>>   
>>   	master_tile_ctl = dg1_master_intr_disable(regs);
>> -	if (!master_tile_ctl) {
>> -		dg1_master_intr_enable(regs);
>> -		return IRQ_NONE;
>> +	if (!master_tile_ctl)
>> +		goto enable_none;
>> +
>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>> +			 master_tile_ctl);
> 
> I know we have a bunch of them already, but shouldn't we be avoiding
> printk-based stuff like this inside interrupt handlers?  Should we be
> migrating all these error messages over to trace_printk or something
> similar that's safer to use?

Not sure - I kind of think some really unexpected and worrying 
situations should be loud and on by default. Risk is then spam if not 
ratelimited. Maybe we should instead ratelimit most errors/warnings 
coming from irq handlers?

In this particular case at least DRM_ERROR with no device info is the 
odd one out in the entire file so I'd suggest changing at least that, if 
the rest of my changes are of questionable benefit.

Regards,

Tvrtko

> 
> 
> Matt
> 
>> +		goto enable_none;
>>   	}
>>   
>>   	/* FIXME: we only support tile 0 for now. */
>> -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
>> -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
>> -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
>> -	} else {
>> -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
>> -		dg1_master_intr_enable(regs);
>> -		return IRQ_NONE;
>> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
>> +	    DG1_MSTR_TILE(0)) {
>> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
>> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
>> +					     master_tile_ctl)));
>> +		goto enable_none;
>>   	}
>>   
>> +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
>> +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
>> +
>>   	gen11_gt_irq_handler(gt, master_ctl);
>>   
>>   	if (master_ctl & GEN11_DISPLAY_IRQ)
>> @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>   	pmu_irq_stats(i915, IRQ_HANDLED);
>>   
>>   	return IRQ_HANDLED;
>> +
>> +enable_none:
>> +	dg1_master_intr_enable(regs);
>> +none:
>> +	return IRQ_NONE;
>>   }
>>   
>>   /* Called from drm generic code, passed 'crtc' which
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index d8579ab9384c..eefa301c6430 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -5774,6 +5774,7 @@
>>   
>>   #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
>>   #define   DG1_MSTR_IRQ			REG_BIT(31)
>> +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
>>   #define   DG1_MSTR_TILE(t)		REG_BIT(t)
>>   
>>   #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
>> -- 
>> 2.32.0
>>
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-25 16:03     ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-25 16:03 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, dri-devel


On 24/05/2022 18:51, Matt Roper wrote:
> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> Catch and log any garbage in the register, including no tiles marked, or
>> multiple tiles marked.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Matt Roper <matthew.d.roper@intel.com>
>> ---
>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>> during glmark and more badness. So I thought lets log all possible failure
>> modes from here and also use per device logging.
>> ---
>>   drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>   drivers/gpu/drm/i915/i915_reg.h |  1 +
>>   2 files changed, 23 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>> index 73cebc6aa650..79853d3fc1ed 100644
>> --- a/drivers/gpu/drm/i915/i915_irq.c
>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>   	u32 gu_misc_iir;
>>   
>>   	if (!intel_irqs_enabled(i915))
>> -		return IRQ_NONE;
>> +		goto none;
>>   
>>   	master_tile_ctl = dg1_master_intr_disable(regs);
>> -	if (!master_tile_ctl) {
>> -		dg1_master_intr_enable(regs);
>> -		return IRQ_NONE;
>> +	if (!master_tile_ctl)
>> +		goto enable_none;
>> +
>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>> +			 master_tile_ctl);
> 
> I know we have a bunch of them already, but shouldn't we be avoiding
> printk-based stuff like this inside interrupt handlers?  Should we be
> migrating all these error messages over to trace_printk or something
> similar that's safer to use?

Not sure - I kind of think some really unexpected and worrying 
situations should be loud and on by default. Risk is then spam if not 
ratelimited. Maybe we should instead ratelimit most errors/warnings 
coming from irq handlers?

In this particular case at least DRM_ERROR with no device info is the 
odd one out in the entire file so I'd suggest changing at least that, if 
the rest of my changes are of questionable benefit.

Regards,

Tvrtko

> 
> 
> Matt
> 
>> +		goto enable_none;
>>   	}
>>   
>>   	/* FIXME: we only support tile 0 for now. */
>> -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
>> -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
>> -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
>> -	} else {
>> -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
>> -		dg1_master_intr_enable(regs);
>> -		return IRQ_NONE;
>> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
>> +	    DG1_MSTR_TILE(0)) {
>> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
>> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
>> +					     master_tile_ctl)));
>> +		goto enable_none;
>>   	}
>>   
>> +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
>> +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
>> +
>>   	gen11_gt_irq_handler(gt, master_ctl);
>>   
>>   	if (master_ctl & GEN11_DISPLAY_IRQ)
>> @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>   	pmu_irq_stats(i915, IRQ_HANDLED);
>>   
>>   	return IRQ_HANDLED;
>> +
>> +enable_none:
>> +	dg1_master_intr_enable(regs);
>> +none:
>> +	return IRQ_NONE;
>>   }
>>   
>>   /* Called from drm generic code, passed 'crtc' which
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
>> index d8579ab9384c..eefa301c6430 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -5774,6 +5774,7 @@
>>   
>>   #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
>>   #define   DG1_MSTR_IRQ			REG_BIT(31)
>> +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
>>   #define   DG1_MSTR_TILE(t)		REG_BIT(t)
>>   
>>   #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
>> -- 
>> 2.32.0
>>
> 

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-25 16:03     ` [Intel-gfx] " Tvrtko Ursulin
@ 2022-05-25 18:05       ` Matt Roper
  -1 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-25 18:05 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin

On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> 
> On 24/05/2022 18:51, Matt Roper wrote:
> > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > 
> > > Catch and log any garbage in the register, including no tiles marked, or
> > > multiple tiles marked.
> > > 
> > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > ---
> > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > during glmark and more badness. So I thought lets log all possible failure
> > > modes from here and also use per device logging.
> > > ---
> > >   drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > >   drivers/gpu/drm/i915/i915_reg.h |  1 +
> > >   2 files changed, 23 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > index 73cebc6aa650..79853d3fc1ed 100644
> > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > >   	u32 gu_misc_iir;
> > >   	if (!intel_irqs_enabled(i915))
> > > -		return IRQ_NONE;
> > > +		goto none;
> > >   	master_tile_ctl = dg1_master_intr_disable(regs);
> > > -	if (!master_tile_ctl) {
> > > -		dg1_master_intr_enable(regs);
> > > -		return IRQ_NONE;
> > > +	if (!master_tile_ctl)
> > > +		goto enable_none;
> > > +
> > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > +			 master_tile_ctl);
> > 
> > I know we have a bunch of them already, but shouldn't we be avoiding
> > printk-based stuff like this inside interrupt handlers?  Should we be
> > migrating all these error messages over to trace_printk or something
> > similar that's safer to use?
> 
> Not sure - I kind of think some really unexpected and worrying situations
> should be loud and on by default. Risk is then spam if not ratelimited.
> Maybe we should instead ratelimit most errors/warnings coming for irq
> handlers?

It's not the risk of spam that's the problem, but rather that
printk-based stuff eventually calls into the console code to flush its
buffers.  That's way more overhead than you want in an interrupt handler
so it's bad on its own, but if you're using something slow like a serial
console, it becomes even more of a problem.

While the unexpected bits in the master tile register are strange and
may point to a bigger problem somewhere else, they're also harmless on
their own since we should just ignore those bits and only process the
valid tiles.

> 
> In this particular case at least DRM_ERROR with no device info is the odd
> one out in the entire file so I'd suggest changing at least that, if the
> rest of my changes is of questionable benefit.

Changing DRM_ERROR -> drm_err would probably be fine in the short term
since it doesn't really make us any worse off.  Changing to drm_warn
might not be great since we're generating a lot more lines of output and
probably multiplying the already bad overhead that shouldn't be
happening in an interrupt handler.  But if we could update the interrupt
handler to just save away the details and do the actual drm_warn later,
outside the interrupt handler code, that would be okay.  We should
probably work toward something like that for all of our interrupt
handler warning/error messages.


Matt

> 
> Regards,
> 
> Tvrtko
> 
> > 
> > 
> > Matt
> > 
> > > +		goto enable_none;
> > >   	}
> > >   	/* FIXME: we only support tile 0 for now. */
> > > -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
> > > -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> > > -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> > > -	} else {
> > > -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
> > > -		dg1_master_intr_enable(regs);
> > > -		return IRQ_NONE;
> > > +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> > > +	    DG1_MSTR_TILE(0)) {
> > > +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> > > +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> > > +					     master_tile_ctl)));
> > > +		goto enable_none;
> > >   	}
> > > +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> > > +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> > > +
> > >   	gen11_gt_irq_handler(gt, master_ctl);
> > >   	if (master_ctl & GEN11_DISPLAY_IRQ)
> > > @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > >   	pmu_irq_stats(i915, IRQ_HANDLED);
> > >   	return IRQ_HANDLED;
> > > +
> > > +enable_none:
> > > +	dg1_master_intr_enable(regs);
> > > +none:
> > > +	return IRQ_NONE;
> > >   }
> > >   /* Called from drm generic code, passed 'crtc' which
> > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > > index d8579ab9384c..eefa301c6430 100644
> > > --- a/drivers/gpu/drm/i915/i915_reg.h
> > > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > > @@ -5774,6 +5774,7 @@
> > >   #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
> > >   #define   DG1_MSTR_IRQ			REG_BIT(31)
> > > +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
> > >   #define   DG1_MSTR_TILE(t)		REG_BIT(t)
> > >   #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
> > > -- 
> > > 2.32.0
> > > 
> > 

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-25 18:05       ` Matt Roper
  0 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-25 18:05 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel

On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> 
> On 24/05/2022 18:51, Matt Roper wrote:
> > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > 
> > > Catch and log any garbage in the register, including no tiles marked, or
> > > multiple tiles marked.
> > > 
> > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > ---
> > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > during glmark and more badness. So I thought lets log all possible failure
> > > modes from here and also use per device logging.
> > > ---
> > >   drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > >   drivers/gpu/drm/i915/i915_reg.h |  1 +
> > >   2 files changed, 23 insertions(+), 11 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > index 73cebc6aa650..79853d3fc1ed 100644
> > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > >   	u32 gu_misc_iir;
> > >   	if (!intel_irqs_enabled(i915))
> > > -		return IRQ_NONE;
> > > +		goto none;
> > >   	master_tile_ctl = dg1_master_intr_disable(regs);
> > > -	if (!master_tile_ctl) {
> > > -		dg1_master_intr_enable(regs);
> > > -		return IRQ_NONE;
> > > +	if (!master_tile_ctl)
> > > +		goto enable_none;
> > > +
> > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > +			 master_tile_ctl);
> > 
> > I know we have a bunch of them already, but shouldn't we be avoiding
> > printk-based stuff like this inside interrupt handlers?  Should we be
> > migrating all these error messages over to trace_printk or something
> > similar that's safer to use?
> 
> Not sure - I kind of think some really unexpected and worrying situations
> should be loud and on by default. Risk is then spam if not ratelimited.
> Maybe we should instead ratelimit most errors/warnings coming for irq
> handlers?

It's not the risk of spam that's the problem, but rather that
printk-based stuff eventually calls into the console code to flush its
buffers.  That's way more overhead than you want in an interrupt handler
so it's bad on its own, but if you're using something slow like a serial
console, it becomes even more of a problem.

While the unexpected bits in the master tile register are strange and
may point to a bigger problem somewhere else, they're also harmless on
their own since we should just ignore those bits and only process the
valid tiles.

> 
> In this particular case at least DRM_ERROR with no device info is the odd
> one out in the entire file so I'd suggest changing at least that, if the
> rest of my changes is of questionable benefit.

Changing DRM_ERROR -> drm_err would probably be fine in the short term
since it doesn't really make us any worse off.  Changing to drm_warn
might not be great since we're generating a lot more lines of output and
probably multiplying the already bad overhead that shouldn't be
happening in an interrupt handler.  But if we could update the interrupt
handler to just save away the details and do the actual drm_warn later,
outside the interrupt handler code, that would be okay.  We should
probably work toward something like that for all of our interrupt
handler warning/error messages.


Matt

> 
> Regards,
> 
> Tvrtko
> 
> > 
> > 
> > Matt
> > 
> > > +		goto enable_none;
> > >   	}
> > >   	/* FIXME: we only support tile 0 for now. */
> > > -	if (master_tile_ctl & DG1_MSTR_TILE(0)) {
> > > -		master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> > > -		raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> > > -	} else {
> > > -		DRM_ERROR("Tile not supported: 0x%08x\n", master_tile_ctl);
> > > -		dg1_master_intr_enable(regs);
> > > -		return IRQ_NONE;
> > > +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> > > +	    DG1_MSTR_TILE(0)) {
> > > +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> > > +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> > > +					     master_tile_ctl)));
> > > +		goto enable_none;
> > >   	}
> > > +	master_ctl = raw_reg_read(regs, GEN11_GFX_MSTR_IRQ);
> > > +	raw_reg_write(regs, GEN11_GFX_MSTR_IRQ, master_ctl);
> > > +
> > >   	gen11_gt_irq_handler(gt, master_ctl);
> > >   	if (master_ctl & GEN11_DISPLAY_IRQ)
> > > @@ -2810,6 +2816,11 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > >   	pmu_irq_stats(i915, IRQ_HANDLED);
> > >   	return IRQ_HANDLED;
> > > +
> > > +enable_none:
> > > +	dg1_master_intr_enable(regs);
> > > +none:
> > > +	return IRQ_NONE;
> > >   }
> > >   /* Called from drm generic code, passed 'crtc' which
> > > diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> > > index d8579ab9384c..eefa301c6430 100644
> > > --- a/drivers/gpu/drm/i915/i915_reg.h
> > > +++ b/drivers/gpu/drm/i915/i915_reg.h
> > > @@ -5774,6 +5774,7 @@
> > >   #define DG1_MSTR_TILE_INTR		_MMIO(0x190008)
> > >   #define   DG1_MSTR_IRQ			REG_BIT(31)
> > > +#define   DG1_MSTR_TILE_MASK		REG_GENMASK(3, 0)
> > >   #define   DG1_MSTR_TILE(t)		REG_BIT(t)
> > >   #define GEN11_DISPLAY_INT_CTL		_MMIO(0x44200)
> > > -- 
> > > 2.32.0
> > > 
> > 

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-25 16:03     ` [Intel-gfx] " Tvrtko Ursulin
@ 2022-05-25 18:14       ` Lucas De Marchi
  -1 siblings, 0 replies; 25+ messages in thread
From: Lucas De Marchi @ 2022-05-25 18:14 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin

On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>
>On 24/05/2022 18:51, Matt Roper wrote:
>>On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>>Catch and log any garbage in the register, including no tiles marked, or
>>>multiple tiles marked.
>>>
>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>Cc: Matt Roper <matthew.d.roper@intel.com>
>>>---
>>>We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>during glmark and more badness. So I thought lets log all possible failure
>>>modes from here and also use per device logging.
>>>---
>>>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>  2 files changed, 23 insertions(+), 11 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>index 73cebc6aa650..79853d3fc1ed 100644
>>>--- a/drivers/gpu/drm/i915/i915_irq.c
>>>+++ b/drivers/gpu/drm/i915/i915_irq.c
>>>@@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>  	u32 gu_misc_iir;
>>>  	if (!intel_irqs_enabled(i915))
>>>-		return IRQ_NONE;
>>>+		goto none;
>>>  	master_tile_ctl = dg1_master_intr_disable(regs);
>>>-	if (!master_tile_ctl) {
>>>-		dg1_master_intr_enable(regs);
>>>-		return IRQ_NONE;
>>>+	if (!master_tile_ctl)
>>>+		goto enable_none;
>>>+
>>>+	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>+		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>+			 master_tile_ctl);
>>
>>I know we have a bunch of them already, but shouldn't we be avoiding
>>printk-based stuff like this inside interrupt handlers?  Should we be
>>migrating all these error messages over to trace_printk or something
>>similar that's safer to use?
>
>Not sure - I kind of think some really unexpected and worrying 
>situations should be loud and on by default. Risk is then spam if not 
>ratelimited. Maybe we should instead ratelimit most errors/warnings 
>coming for irq handlers?
>
>In this particular case at least DRM_ERROR with no device info is the 
>odd one out in the entire file so I'd suggest changing at least that, 
>if the rest of my changes is of questionable benefit.

I'd rather remove the printk's from irq rather than adding more. At the very
least, they should be the _once variant or ratelimited. One of the few
cases to even deserve an unlikely(), even to document this shouldn't
really be happening.

Our irq handlers (particularly on dgfx and multi-gt) are already too
long running... I don't like making them any longer or slower.


Lucas De Marchi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-25 18:14       ` Lucas De Marchi
  0 siblings, 0 replies; 25+ messages in thread
From: Lucas De Marchi @ 2022-05-25 18:14 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, dri-devel

On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>
>On 24/05/2022 18:51, Matt Roper wrote:
>>On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>>Catch and log any garbage in the register, including no tiles marked, or
>>>multiple tiles marked.
>>>
>>>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>Cc: Matt Roper <matthew.d.roper@intel.com>
>>>---
>>>We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>during glmark and more badness. So I thought lets log all possible failure
>>>modes from here and also use per device logging.
>>>---
>>>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>  2 files changed, 23 insertions(+), 11 deletions(-)
>>>
>>>diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>index 73cebc6aa650..79853d3fc1ed 100644
>>>--- a/drivers/gpu/drm/i915/i915_irq.c
>>>+++ b/drivers/gpu/drm/i915/i915_irq.c
>>>@@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>  	u32 gu_misc_iir;
>>>  	if (!intel_irqs_enabled(i915))
>>>-		return IRQ_NONE;
>>>+		goto none;
>>>  	master_tile_ctl = dg1_master_intr_disable(regs);
>>>-	if (!master_tile_ctl) {
>>>-		dg1_master_intr_enable(regs);
>>>-		return IRQ_NONE;
>>>+	if (!master_tile_ctl)
>>>+		goto enable_none;
>>>+
>>>+	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>+		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>+			 master_tile_ctl);
>>
>>I know we have a bunch of them already, but shouldn't we be avoiding
>>printk-based stuff like this inside interrupt handlers?  Should we be
>>migrating all these error messages over to trace_printk or something
>>similar that's safer to use?
>
>Not sure - I kind of think some really unexpected and worrying 
>situations should be loud and on by default. Risk is then spam if not 
>ratelimited. Maybe we should instead ratelimit most errors/warnings 
>coming for irq handlers?
>
>In this particular case at least DRM_ERROR with no device info is the 
>odd one out in the entire file so I'd suggest changing at least that, 
>if the rest of my changes is of questionable benefit.

I'd rather remove the printk's from irq rather than adding more. At the very
least, they should be the _once variant or ratelimited. One of the few
cases to even deserve an unlikely(), even to document this shouldn't
really be happening.

Our irq handlers (particularly on dgfx and multi-gt) are already too
long running... I don't like making them any longer or slower.


Lucas De Marchi

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-25 18:05       ` [Intel-gfx] " Matt Roper
@ 2022-05-26 10:18         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-26 10:18 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin


On 25/05/2022 19:05, Matt Roper wrote:
> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>
>> On 24/05/2022 18:51, Matt Roper wrote:
>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>> multiple tiles marked.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>> ---
>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>> during glmark and more badness. So I thought lets log all possible failure
>>>> modes from here and also use per device logging.
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>    drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>    2 files changed, 23 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>    	u32 gu_misc_iir;
>>>>    	if (!intel_irqs_enabled(i915))
>>>> -		return IRQ_NONE;
>>>> +		goto none;
>>>>    	master_tile_ctl = dg1_master_intr_disable(regs);
>>>> -	if (!master_tile_ctl) {
>>>> -		dg1_master_intr_enable(regs);
>>>> -		return IRQ_NONE;
>>>> +	if (!master_tile_ctl)
>>>> +		goto enable_none;
>>>> +
>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>> +			 master_tile_ctl);
>>>
>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>> migrating all these error messages over to trace_printk or something
>>> similar that's safer to use?
>>
>> Not sure - I kind of think some really unexpected and worrying situations
>> should be loud and on by default. Risk is then spam if not ratelimited.
>> Maybe we should instead ratelimit most errors/warnings coming for irq
>> handlers?
> 
> It's not the risk of spam that's the problem, but rather that
> printk-based stuff eventually calls into the console code to flush its
> buffers.  That's way more overhead than you want in an interrupt handler
> so it's bad on its own, but if you're using something slow like a serial
> console, it becomes even more of a problem.

Is it a problem for messages which we never expect to see?

> While the unexpected bits in the master tile register are strange and
> may point to a bigger problem somewhere else, they're also harmless on
> their own since we should just ignore those bits and only process the
> valid tiles.

Yes, I was expecting that a patch belonging to multi-tile enablement 
would be incoming soon, which would be changing:

+	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
+	    DG1_MSTR_TILE(0)) {
+		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
+			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
+					     master_tile_ctl)));
+		goto enable_none;
  	}

 From this patch, into something completely different like walking bit 
by bit, handling the present tiles, and warning on unexpected ones. What 
should remain though is warning on no tiles signaled (which is what we saw, 
together with garbage in reserved bits).

>> In this particular case at least DRM_ERROR with no device info is the odd
>> one out in the entire file so I'd suggest changing at least that, if the
>> rest of my changes is of questionable benefit.
> 
> Changing DRM_ERROR -> drm_err would probably be fine in the short term
> since it doesn't really make us any worse off.  Changing to drm_warn
> might not be great since we're generating a lot more lines of output and

Sorry I don't follow - why does replacing drm_err with drm_warn generate 
(a lot) more lines of output?

But it can be drm_err for all I care, I don't think we really have 
a consistent story between errors and warnings in this area.

> probably multiplying the already bad overhead that shouldn't be
> happening in an interrupt handler.  But if we could update the interrupt
> handler to just save away the details and do the actual drm_warn later,
> outside the interrupt handler code, that would be okay.  We should
> probably work toward something like that for all of our interrupt
> handler warning/error messages.

Not sure I agree - for messages which we don't expect to see it doesn't 
really matter that there will be overhead when they are hit. Presumably 
bad things are already happening there so spending effort to optimise 
those paths is questionable.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-26 10:18         ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-26 10:18 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, dri-devel


On 25/05/2022 19:05, Matt Roper wrote:
> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>
>> On 24/05/2022 18:51, Matt Roper wrote:
>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>> multiple tiles marked.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>> ---
>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>> during glmark and more badness. So I thought lets log all possible failure
>>>> modes from here and also use per device logging.
>>>> ---
>>>>    drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>    drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>    2 files changed, 23 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>    	u32 gu_misc_iir;
>>>>    	if (!intel_irqs_enabled(i915))
>>>> -		return IRQ_NONE;
>>>> +		goto none;
>>>>    	master_tile_ctl = dg1_master_intr_disable(regs);
>>>> -	if (!master_tile_ctl) {
>>>> -		dg1_master_intr_enable(regs);
>>>> -		return IRQ_NONE;
>>>> +	if (!master_tile_ctl)
>>>> +		goto enable_none;
>>>> +
>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>> +			 master_tile_ctl);
>>>
>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>> migrating all these error messages over to trace_printk or something
>>> similar that's safer to use?
>>
>> Not sure - I kind of think some really unexpected and worrying situations
>> should be loud and on by default. Risk is then spam if not ratelimited.
>> Maybe we should instead ratelimit most errors/warnings coming for irq
>> handlers?
> 
> It's not the risk of spam that's the problem, but rather that
> printk-based stuff eventually calls into the console code to flush its
> buffers.  That's way more overhead than you want in an interrupt handler
> so it's bad on its own, but if you're using something slow like a serial
> console, it becomes even more of a problem.

Is it a problem for messages which we never expect to see?

> While the unexpected bits in the master tile register are strange and
> may point to a bigger problem somewhere else, they're also harmless on
> their own since we should just ignore those bits and only process the
> valid tiles.

Yes, I was expecting that a patch belonging to multi-tile enablement 
would be incoming soon, which would be changing:

+	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
+	    DG1_MSTR_TILE(0)) {
+		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
+			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
+					     master_tile_ctl)));
+		goto enable_none;
  	}

 From this patch, into something completely different like walking bit 
by bit, handling the present tiles, and warning on unexpected ones. What 
should remain though is warning on no tiles signaled (which is what we saw, 
together with garbage in reserved bits).

>> In this particular case at least DRM_ERROR with no device info is the odd
>> one out in the entire file so I'd suggest changing at least that, if the
>> rest of my changes is of questionable benefit.
> 
> Changing DRM_ERROR -> drm_err would probably be fine in the short term
> since it doesn't really make us any worse off.  Changing to drm_warn
> might not be great since we're generating a lot more lines of output and

Sorry I don't follow - why does replacing drm_err with drm_warn generate 
(a lot) more lines of output?

But it can be drm_err for all I care, I don't think we really have 
a consistent story between errors and warnings in this area.

> probably multiplying the already bad overhead that shouldn't be
> happening in an interrupt handler.  But if we could update the interrupt
> handler to just save away the details and do the actual drm_warn later,
> outside the interrupt handler code, that would be okay.  We should
> probably work toward something like that for all of our interrupt
> handler warning/error messages.

Not sure I agree - for messages which we don't expect to see it doesn't 
really matter that there will be overhead when they are hit. Presumably 
bad things are already happening there so spending effort to optimise 
those paths is questionable.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-25 18:14       ` [Intel-gfx] " Lucas De Marchi
@ 2022-05-26 10:29         ` Tvrtko Ursulin
  -1 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-26 10:29 UTC (permalink / raw)
  To: Lucas De Marchi; +Cc: Intel-gfx, dri-devel, Tvrtko Ursulin


On 25/05/2022 19:14, Lucas De Marchi wrote:
> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>
>> On 24/05/2022 18:51, Matt Roper wrote:
>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Catch and log any garbage in the register, including no tiles 
>>>> marked, or
>>>> multiple tiles marked.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>> ---
>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 
>>>> 0xF9D2C008)
>>>> during glmark and more badness. So I thought lets log all possible 
>>>> failure
>>>> modes from here and also use per device logging.
>>>> ---
>>>>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>  2 files changed, 23 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c 
>>>> b/drivers/gpu/drm/i915/i915_irq.c
>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, 
>>>> void *arg)
>>>>      u32 gu_misc_iir;
>>>>      if (!intel_irqs_enabled(i915))
>>>> -        return IRQ_NONE;
>>>> +        goto none;
>>>>      master_tile_ctl = dg1_master_intr_disable(regs);
>>>> -    if (!master_tile_ctl) {
>>>> -        dg1_master_intr_enable(regs);
>>>> -        return IRQ_NONE;
>>>> +    if (!master_tile_ctl)
>>>> +        goto enable_none;
>>>> +
>>>> +    if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>> +        drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>> +             master_tile_ctl);
>>>
>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>> migrating all these error messages over to trace_printk or something
>>> similar that's safer to use?
>>
>> Not sure - I kind of think some really unexpected and worrying 
>> situations should be loud and on by default. Risk is then spam if not 
>> ratelimited. Maybe we should instead ratelimit most errors/warnings 
>> coming for irq handlers?
>>
>> In this particular case at least DRM_ERROR with no device info is the 
>> odd one out in the entire file so I'd suggest changing at least that, 
>> if the rest of my changes is of questionable benefit.
> 
> I'd rather remove the printk's from irq rather than adding more. At the 
> very
> least, they should be the _once variant or ratelimited. One of the few
> cases to even deserve a unlikely(), even to document this shouldn't
> really be happening.

I would support ratelimited for all the unexpected bits set, no bits 
set, or similar conditions. I wouldn't remove such printks to 
micro-optimize things. That would potentially lose important feedback in 
cases when we hit truly unexpected situations.

But annotating them as unlikely would be a good thing.

> Our irq handlers (particularly on dgfx and multi-gt) are already too
> long running... I don't like making them any longer or slower.

How come? I mean which interrupts are a problem there? Isn't GuC 
supposed to be taking on that load on itself, isn't that one of the main 
selling points?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-26 10:29         ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-05-26 10:29 UTC (permalink / raw)
  To: Lucas De Marchi; +Cc: Intel-gfx, dri-devel


On 25/05/2022 19:14, Lucas De Marchi wrote:
> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>
>> On 24/05/2022 18:51, Matt Roper wrote:
>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> Catch and log any garbage in the register, including no tiles 
>>>> marked, or
>>>> multiple tiles marked.
>>>>
>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>> ---
>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 
>>>> 0xF9D2C008)
>>>> during glmark and more badness. So I thought lets log all possible 
>>>> failure
>>>> modes from here and also use per device logging.
>>>> ---
>>>>  drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>  drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>  2 files changed, 23 insertions(+), 11 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c 
>>>> b/drivers/gpu/drm/i915/i915_irq.c
>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, 
>>>> void *arg)
>>>>      u32 gu_misc_iir;
>>>>      if (!intel_irqs_enabled(i915))
>>>> -        return IRQ_NONE;
>>>> +        goto none;
>>>>      master_tile_ctl = dg1_master_intr_disable(regs);
>>>> -    if (!master_tile_ctl) {
>>>> -        dg1_master_intr_enable(regs);
>>>> -        return IRQ_NONE;
>>>> +    if (!master_tile_ctl)
>>>> +        goto enable_none;
>>>> +
>>>> +    if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>> +        drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>> +             master_tile_ctl);
>>>
>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>> migrating all these error messages over to trace_printk or something
>>> similar that's safer to use?
>>
>> Not sure - I kind of think some really unexpected and worrying 
>> situations should be loud and on by default. Risk is then spam if not 
>> ratelimited. Maybe we should instead ratelimit most errors/warnings 
>> coming for irq handlers?
>>
>> In this particular case at least DRM_ERROR with no device info is the 
>> odd one out in the entire file so I'd suggest changing at least that, 
>> if the rest of my changes is of questionable benefit.
> 
> I'd rather remove the printk's from irq rather than adding more. At the 
> very
> least, they should be the _once variant or ratelimited. One of the few
> cases to even deserve a unlikely(), even to document this shouldn't
> really be happening.

I would support ratelimited for all the unexpected bits set, no bits 
set, or similar conditions. I wouldn't remove such printks to 
micro-optimize things. That would potentially lose important feedback in 
cases when we hit truly unexpected situations.

But annotating them as unlikely would be a good thing.

> Our irq handlers (particularly on dgfx and multi-gt) are already too
> long running... I don't like making them any longer or slower.

How come? I mean which interrupts are a problem there? Isn't GuC 
supposed to be taking on that load on itself, isn't that one of the main 
selling points?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-26 10:18         ` [Intel-gfx] " Tvrtko Ursulin
@ 2022-05-27 18:42           ` Matt Roper
  -1 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-27 18:42 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, Lucas De Marchi, dri-devel

On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
> 
> On 25/05/2022 19:05, Matt Roper wrote:
> > On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 24/05/2022 18:51, Matt Roper wrote:
> > > > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > 
> > > > > Catch and log any garbage in the register, including no tiles marked, or
> > > > > multiple tiles marked.
> > > > > 
> > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > > > ---
> > > > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > > > during glmark and more badness. So I thought lets log all possible failure
> > > > > modes from here and also use per device logging.
> > > > > ---
> > > > >    drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > > > >    drivers/gpu/drm/i915/i915_reg.h |  1 +
> > > > >    2 files changed, 23 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > > > index 73cebc6aa650..79853d3fc1ed 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > > > >    	u32 gu_misc_iir;
> > > > >    	if (!intel_irqs_enabled(i915))
> > > > > -		return IRQ_NONE;
> > > > > +		goto none;
> > > > >    	master_tile_ctl = dg1_master_intr_disable(regs);
> > > > > -	if (!master_tile_ctl) {
> > > > > -		dg1_master_intr_enable(regs);
> > > > > -		return IRQ_NONE;
> > > > > +	if (!master_tile_ctl)
> > > > > +		goto enable_none;
> > > > > +
> > > > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > > > +			 master_tile_ctl);
> > > > 
> > > > I know we have a bunch of them already, but shouldn't we be avoiding
> > > > printk-based stuff like this inside interrupt handlers?  Should we be
> > > > migrating all these error messages over to trace_printk or something
> > > > similar that's safer to use?
> > > 
> > > Not sure - I kind of think some really unexpected and worrying situations
> > > should be loud and on by default. Risk is then spam if not ratelimited.
> > > Maybe we should instead ratelimit most errors/warnings coming for irq
> > > handlers?
> > 
> > It's not the risk of spam that's the problem, but rather that
> > printk-based stuff eventually calls into the console code to flush its
> > buffers.  That's way more overhead than you want in an interrupt handler
> > so it's bad on its own, but if you're using something slow like a serial
> > console, it becomes even more of a problem.
> 
> Is it a problem for messages which we never expect to see?

Kind of.  While not as catastrophic, it's the same argument for why we
don't use BUG() anymore...when the impossible does manage to happen
there's unnecessary collateral damage on things outside of graphics.  If
we're adding huge delays inside an interrupt handler (while other
interrupts are disabled) that impacts the system-wide usability, not
just our own driver.

I'd also argue that these messages actually are semi-expected.  Random
bits being set shouldn't happen, but in the world of dgpu's, we do
occasionally see cases where the PCI link itself goes down for reasons
outside our control and then all registers read back as 0xFFFFFFFF,
which will probably trigger error messages here (as well as a bunch of
other places).

> 
> > While the unexpected bits in the master tile register are strange and
> > may point to a bigger problem somewhere else, they're also harmless on
> > their own since we should just ignore those bits and only process the
> > valid tiles.
> 
> Yes, I was expecting that a patch belonging to multi-tile enablement would
> be incoming soon, which would be changing:
> 
> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> +	    DG1_MSTR_TILE(0)) {
> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> +					     master_tile_ctl)));
> +		goto enable_none;
>  	}
> 
> From this patch, into something completely different like walking bit by
> bit, handling the present tiles, and warning on unexpected ones. What should
> remain though is warning on no tiles signaled (which is what we saw, together
> with garbage in reserved bits).

Yeah.  Although I still feel the interrupt handler should really just be
flagging the errors so that the actual prints themselves can happen
outside the interrupt.

> 
> > > In this particular case at least DRM_ERROR with no device info is the odd
> > > one out in the entire file so I'd suggest changing at least that, if the
> > > rest of my changes is of questionable benefit.
> > 
> > Changing DRM_ERROR -> drm_err would probably be fine in the short term
> > since it doesn't really make us any worse off.  Changing to drm_warn
> > might not be great since we're generating a lot more lines of output and
> 
> Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
> lot) more lines of output?

Sorry, my mistake; I had it in my mind that we were talking about a
drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
negative impact.

> 
> But it can be drm_err for all I care, I don't think we really have
> consistent story between errors and warnings in this area.
> 
> > probably multiplying the already bad overhead that shouldn't be
> > happening in an interrupt handler.  But if we could update the interrupt
> > handler to just save away the details and do the actual drm_warn later,
> > outside the interrupt handler code, that would be okay.  We should
> > probably work toward something like that for all of our interrupt
> > handler warning/error messages.
> 
> Not sure I agree - for messages which we don't expect to see it doesn't
> really matter that there will be overhead when they are hit. Presumably bad
> things are already happening there so spending effort to optimise those path
> is questionable.

Something bad is happening to graphics if we hit one of these cases.
But if we start doing prints while interrupts are disabled, we start
having more of a negative impact on the rest of the system too.


Matt

> 
> Regards,
> 
> Tvrtko

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-05-27 18:42           ` Matt Roper
  0 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-05-27 18:42 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, Lucas De Marchi, dri-devel, Tvrtko Ursulin

On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
> 
> On 25/05/2022 19:05, Matt Roper wrote:
> > On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> > > 
> > > On 24/05/2022 18:51, Matt Roper wrote:
> > > > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > 
> > > > > Catch and log any garbage in the register, including no tiles marked, or
> > > > > multiple tiles marked.
> > > > > 
> > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > > > ---
> > > > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > > > during glmark and more badness. So I thought lets log all possible failure
> > > > > modes from here and also use per device logging.
> > > > > ---
> > > > >    drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > > > >    drivers/gpu/drm/i915/i915_reg.h |  1 +
> > > > >    2 files changed, 23 insertions(+), 11 deletions(-)
> > > > > 
> > > > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > > > index 73cebc6aa650..79853d3fc1ed 100644
> > > > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > > > >    	u32 gu_misc_iir;
> > > > >    	if (!intel_irqs_enabled(i915))
> > > > > -		return IRQ_NONE;
> > > > > +		goto none;
> > > > >    	master_tile_ctl = dg1_master_intr_disable(regs);
> > > > > -	if (!master_tile_ctl) {
> > > > > -		dg1_master_intr_enable(regs);
> > > > > -		return IRQ_NONE;
> > > > > +	if (!master_tile_ctl)
> > > > > +		goto enable_none;
> > > > > +
> > > > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > > > +			 master_tile_ctl);
> > > > 
> > > > I know we have a bunch of them already, but shouldn't we be avoiding
> > > > printk-based stuff like this inside interrupt handlers?  Should we be
> > > > migrating all these error messages over to trace_printk or something
> > > > similar that's safer to use?
> > > 
> > > Not sure - I kind of think some really unexpected and worrying situations
> > > should be loud and on by default. Risk is then spam if not ratelimited.
> > > Maybe we should instead ratelimit most errors/warnings coming for irq
> > > handlers?
> > 
> > It's not the risk of spam that's the problem, but rather that
> > printk-based stuff eventually calls into the console code to flush its
> > buffers.  That's way more overhead than you want in an interrupt handler
> > so it's bad on its own, but if you're using something slow like a serial
> > console, it becomes even more of a problem.
> 
> Is it a problem for messages which we never expect to see?

Kind of.  While not as catastrophic, it's the same argument for why we
don't use BUG() anymore...when the impossible does manage to happen
there's unnecessary collateral damage on things outside of graphics.  If
we're adding huge delays inside an interrupt handler (while other
interrupts are disabled) that impacts the system-wide usability, not
just our own driver.

I'd also argue that these messages actually are semi-expected.  Random
bits being set shouldn't happen, but in the world of dgpu's, we do
occasionally see cases where the PCI link itself goes down for reasons
outside our control and then all registers read back as 0xFFFFFFFF,
which will probably trigger error messages here (as well as a bunch of
other places).

> 
> > While the unexpected bits in the master tile register are strange and
> > may point to a bigger problem somewhere else, they're also harmless on
> > their own since we should just ignore those bits and only process the
> > valid tiles.
> 
> Yes, I was expecting that a patch belonging to multi-tile enablement would
> be incoming soon, which would be changing:
> 
> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> +	    DG1_MSTR_TILE(0)) {
> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> +					     master_tile_ctl)));
> +		goto enable_none;
>  	}
> 
> From this patch, into something completely different like walking bit by
> bit, handling the present tiles, and warning on unexpected ones. What should
> remain though is warning on no tiles signaled (which is what we saw, together
> with garbage in reserved bits).

Yeah.  Although I still feel the interrupt handler should really just be
flagging the errors so that the actual prints themselves can happen
outside the interrupt.

> 
> > > In this particular case at least DRM_ERROR with no device info is the odd
> > > one out in the entire file so I'd suggest changing at least that, if the
> > > rest of my changes is of questionable benefit.
> > 
> > Changing DRM_ERROR -> drm_err would probably be fine in the short term
> > since it doesn't really make us any worse off.  Changing to drm_warn
> > might not be great since we're generating a lot more lines of output and
> 
> Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
> lot) more lines of output?

Sorry, my mistake; I had it in my mind that we were talking about a
drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
negative impact.

> 
> But it can be drm_err for all I care, I don't think we really have
> consistent story between errors and warnings in this area.
> 
> > probably multiplying the already bad overhead that shouldn't be
> > happening in an interrupt handler.  But if we could update the interrupt
> > handler to just save away the details and do the actual drm_warn later,
> > outside the interrupt handler code, that would be okay.  We should
> > probably work toward something like that for all of our interrupt
> > handler warning/error messages.
> 
> Not sure I agree - for messages which we don't expect to see it doesn't
> really matter that there will be overhead when they are hit. Presumably bad
> things are already happening there so spending effort to optimise those path
> is questionable.

Something bad is happening to graphics if we hit one of these cases.
But if we start doing prints while interrupts are disabled, we start
having more of a negative impact on the rest of the system too.


Matt

> 
> Regards,
> 
> Tvrtko

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-05-27 18:42           ` Matt Roper
@ 2022-06-06 11:55             ` Tvrtko Ursulin
  -1 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-06-06 11:55 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, Lucas De Marchi, dri-devel, Tvrtko Ursulin


On 27/05/2022 19:42, Matt Roper wrote:
> On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
>> On 25/05/2022 19:05, Matt Roper wrote:
>>> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 24/05/2022 18:51, Matt Roper wrote:
>>>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>
>>>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>>>> multiple tiles marked.
>>>>>>
>>>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>>>> ---
>>>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>>>> during glmark and more badness. So I thought lets log all possible failure
>>>>>> modes from here and also use per device logging.
>>>>>> ---
>>>>>>     drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>>>     drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>>>     2 files changed, 23 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>>>     	u32 gu_misc_iir;
>>>>>>     	if (!intel_irqs_enabled(i915))
>>>>>> -		return IRQ_NONE;
>>>>>> +		goto none;
>>>>>>     	master_tile_ctl = dg1_master_intr_disable(regs);
>>>>>> -	if (!master_tile_ctl) {
>>>>>> -		dg1_master_intr_enable(regs);
>>>>>> -		return IRQ_NONE;
>>>>>> +	if (!master_tile_ctl)
>>>>>> +		goto enable_none;
>>>>>> +
>>>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>>>> +			 master_tile_ctl);
>>>>>
>>>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>>>> migrating all these error messages over to trace_printk or something
>>>>> similar that's safer to use?
>>>>
>>>> Not sure - I kind of think some really unexpected and worrying situations
>>>> should be loud and on by default. Risk is then spam if not ratelimited.
>>>> Maybe we should instead ratelimit most errors/warnings coming for irq
>>>> handlers?
>>>
>>> It's not the risk of spam that's the problem, but rather that
>>> printk-based stuff eventually calls into the console code to flush its
>>> buffers.  That's way more overhead than you want in an interrupt handler
>>> so it's bad on its own, but if you're using something slow like a serial
>>> console, it becomes even more of a problem.
>>
>> Is it a problem for messages which we never expect to see?
> 
> Kind of.  While not as catastrophic, it's the same argument for why we
> don't use BUG() anymore...when the impossible does manage to happen
> there's unnecessary collateral damage on things outside of graphics.  If
> we're adding huge delays inside an interrupt handler (while other
> interrupts are disabled) that impacts the system-wide usability, not
> just our own driver.
> 
> I'd also argue that these messages actually are semi-expected.  Random
> bits being set shouldn't happen, but in the world of dgpu's, we do
> occasionally see cases where the PCI link itself goes down for reasons
> outside our control and then all registers read back as 0xFFFFFFFF,
> which will probably trigger error messages here (as well as a bunch of
> other places).

Could you expand a bit on what is semi-expected and when? I mean the 
circumstances of PCI link going down. We certainly don't have any code 
to survive that.

>>> While the unexpected bits in the master tile register are strange and
>>> may point to a bigger problem somewhere else, they're also harmless on
>>> their own since we should just ignore those bits and only process the
>>> valid tiles.
>>
>> Yes, I was expecting that a patch belonging to multi-tile enablement would
>> be incoming soon, which would be changing:
>>
>> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
>> +	    DG1_MSTR_TILE(0)) {
>> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
>> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
>> +					     master_tile_ctl)));
>> +		goto enable_none;
>>   	}
>>
>>  From this patch, into something completely different like walking bit by
>> bit, handling the present tiles, and warning on unexpected ones. What should
>> remain though is warning on no tiles signaled (which is what we saw, together
>> with garbage in reserved bits).
> 
> Yeah.  Although I still feel the interrupt handler should really just be
> flagging the errors so that the actual prints themselves can happen
> outside the interrupt.
> 
>>
>>>> In this particular case at least DRM_ERROR with no device info is the odd
>>>> one out in the entire file so I'd suggest changing at least that, if the
>>>> rest of my changes is of questionable benefit.
>>>
>>> Changing DRM_ERROR -> drm_err would probably be fine in the short term
>>> since it doesn't really make us any worse off.  Changing to drm_warn
>>> might not be great since we're generating a lot more lines of output and
>>
>> Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
>> lot) more lines of output?
> 
> Sorry, my mistake; I had it in my mind that we were talking about a
> drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
> and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
> negative impact.
> 
>>
>> But it can be drm_err for all I care, I don't think we really have
>> consistent story between errors and warnings in this area.
>>
>>> probably multiplying the already bad overhead that shouldn't be
>>> happening in an interrupt handler.  But if we could update the interrupt
>>> handler to just save away the details and do the actual drm_warn later,
>>> outside the interrupt handler code, that would be okay.  We should
>>> probably work toward something like that for all of our interrupt
>>> handler warning/error messages.
>>
>> Not sure I agree - for messages which we don't expect to see it doesn't
>> really matter that there will be overhead when they are hit. Presumably bad
>> things are already happening there so spending effort to optimise those path
>> is questionable.
> 
> Something bad is happening to graphics if we hit one of these cases.
> But if we start doing prints while interrupts are disabled, we start
> having more of a negative impact on the rest of the system too.

Truly for the case of this particular patch I don't think we should 
care. Rate limiting should be all that is needed in the short term to 
strike a balance between effort and benefit. But let's first clarify the 
PCI link going down problem.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-06-06 11:55             ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-06-06 11:55 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, Lucas De Marchi, dri-devel


On 27/05/2022 19:42, Matt Roper wrote:
> On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
>> On 25/05/2022 19:05, Matt Roper wrote:
>>> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>>>
>>>> On 24/05/2022 18:51, Matt Roper wrote:
>>>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>
>>>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>>>> multiple tiles marked.
>>>>>>
>>>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>>>> ---
>>>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>>>> during glmark and more badness. So I thought lets log all possible failure
>>>>>> modes from here and also use per device logging.
>>>>>> ---
>>>>>>     drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>>>     drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>>>     2 files changed, 23 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>>>     	u32 gu_misc_iir;
>>>>>>     	if (!intel_irqs_enabled(i915))
>>>>>> -		return IRQ_NONE;
>>>>>> +		goto none;
>>>>>>     	master_tile_ctl = dg1_master_intr_disable(regs);
>>>>>> -	if (!master_tile_ctl) {
>>>>>> -		dg1_master_intr_enable(regs);
>>>>>> -		return IRQ_NONE;
>>>>>> +	if (!master_tile_ctl)
>>>>>> +		goto enable_none;
>>>>>> +
>>>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>>>> +			 master_tile_ctl);
>>>>>
>>>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>>>> migrating all these error messages over to trace_printk or something
>>>>> similar that's safer to use?
>>>>
>>>> Not sure - I kind of think some really unexpected and worrying situations
>>>> should be loud and on by default. Risk is then spam if not ratelimited.
>>>> Maybe we should instead ratelimit most errors/warnings coming for irq
>>>> handlers?
>>>
>>> It's not the risk of spam that's the problem, but rather that
>>> printk-based stuff eventually calls into the console code to flush its
>>> buffers.  That's way more overhead than you want in an interrupt handler
>>> so it's bad on its own, but if you're using something slow like a serial
>>> console, it becomes even more of a problem.
>>
>> Is it a problem for messages which we never expect to see?
> 
> Kind of.  While not as catastrophic, it's the same argument for why we
> don't use BUG() anymore...when the impossible does manage to happen
> there's unnecessary collateral damage on things outside of graphics.  If
> we're adding huge delays inside an interrupt handler (while other
> interrupts are disabled) that impacts the system-wide usability, not
> just our own driver.
> 
> I'd also argue that these messages actually are semi-expected.  Random
> bits being set shouldn't happen, but in the world of dgpu's, we do
> occasionally see cases where the PCI link itself goes down for reasons
> outside our control and then all registers read back as 0xFFFFFFFF,
> which will probably trigger error messages here (as well as a bunch of
> other places).

Could you expand a bit on what is semi-expected and when? I mean the 
circumstances of PCI link going down. We certainly don't have any code 
to survive that.

>>> While the unexpected bits in the master tile register are strange and
>>> may point to a bigger problem somewhere else, they're also harmless on
>>> their own since we should just ignore those bits and only process the
>>> valid tiles.
>>
>> Yes, I was expecting that a patch belonging to multi-tile enablement would
>> be incoming soon, which would be changing:
>>
>> +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
>> +	    DG1_MSTR_TILE(0)) {
>> +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
>> +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
>> +					     master_tile_ctl)));
>> +		goto enable_none;
>>   	}
>>
>>  From this patch, into something completely different like walking bit by
>> bit, handling the present tiles, and warning on unexpected ones. What should
>> remain though is warning on no tiles signaled (which is what we saw, together
>> with garbage in reserved bits).
> 
> Yeah.  Although I still feel the interrupt handler should really just be
> flagging the errors so that the actual prints themselves can happen
> outside the interrupt.
> 
>>
>>>> In this particular case at least DRM_ERROR with no device info is the odd
>>>> one out in the entire file so I'd suggest changing at least that, if the
>>>> rest of my changes is of questionable benefit.
>>>
>>> Changing DRM_ERROR -> drm_err would probably be fine in the short term
>>> since it doesn't really make us any worse off.  Changing to drm_warn
>>> might not be great since we're generating a lot more lines of output and
>>
>> Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
>> lot) more lines of output?
> 
> Sorry, my mistake; I had it in my mind that we were talking about a
> drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
> and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
> negative impact.
> 
>>
>> But it can be drm_err for all I care, I don't think we really have
>> consistent story between errors and warnings in this area.
>>
>>> probably multiplying the already bad overhead that shouldn't be
>>> happening in an interrupt handler.  But if we could update the interrupt
>>> handler to just save away the details and do the actual drm_warn later,
>>> outside the interrupt handler code, that would be okay.  We should
>>> probably work toward something like that for all of our interrupt
>>> handler warning/error messages.
>>
>> Not sure I agree - for messages which we don't expect to see it doesn't
>> really matter that there will be overhead when they are hit. Presumably bad
>> things are already happening there so spending effort to optimise those path
>> is questionable.
> 
> Something bad is happening to graphics if we hit one of these cases.
> But if we start doing prints while interrupts are disabled, we start
> having more of a negative impact on the rest of the system too.

Truly for the case of this particular patch I don't think we should 
care. Rate limiting should be all that is needed in the short term to 
strike a balance between effort and benefit. But let's first clarify the 
PCI link going down problem.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-06-06 11:55             ` [Intel-gfx] " Tvrtko Ursulin
@ 2022-06-06 15:21               ` Matt Roper
  -1 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-06-06 15:21 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, Lucas De Marchi, dri-devel, Tvrtko Ursulin

On Mon, Jun 06, 2022 at 12:55:20PM +0100, Tvrtko Ursulin wrote:
> 
> On 27/05/2022 19:42, Matt Roper wrote:
> > On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
> > > On 25/05/2022 19:05, Matt Roper wrote:
> > > > On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> > > > > 
> > > > > On 24/05/2022 18:51, Matt Roper wrote:
> > > > > > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > > > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > > > 
> > > > > > > Catch and log any garbage in the register, including no tiles marked, or
> > > > > > > multiple tiles marked.
> > > > > > > 
> > > > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > > > > > ---
> > > > > > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > > > > > during glmark and more badness. So I thought lets log all possible failure
> > > > > > > modes from here and also use per device logging.
> > > > > > > ---
> > > > > > >     drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > > > > > >     drivers/gpu/drm/i915/i915_reg.h |  1 +
> > > > > > >     2 files changed, 23 insertions(+), 11 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > index 73cebc6aa650..79853d3fc1ed 100644
> > > > > > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > > > > > >     	u32 gu_misc_iir;
> > > > > > >     	if (!intel_irqs_enabled(i915))
> > > > > > > -		return IRQ_NONE;
> > > > > > > +		goto none;
> > > > > > >     	master_tile_ctl = dg1_master_intr_disable(regs);
> > > > > > > -	if (!master_tile_ctl) {
> > > > > > > -		dg1_master_intr_enable(regs);
> > > > > > > -		return IRQ_NONE;
> > > > > > > +	if (!master_tile_ctl)
> > > > > > > +		goto enable_none;
> > > > > > > +
> > > > > > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > > > > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > > > > > +			 master_tile_ctl);
> > > > > > 
> > > > > > I know we have a bunch of them already, but shouldn't we be avoiding
> > > > > > printk-based stuff like this inside interrupt handlers?  Should we be
> > > > > > migrating all these error messages over to trace_printk or something
> > > > > > similar that's safer to use?
> > > > > 
> > > > > Not sure - I kind of think some really unexpected and worrying situations
> > > > > should be loud and on by default. Risk is then spam if not ratelimited.
> > > > > > Maybe we should instead ratelimit most errors/warnings coming from irq
> > > > > handlers?
> > > > 
> > > > It's not the risk of spam that's the problem, but rather that
> > > > printk-based stuff eventually calls into the console code to flush its
> > > > buffers.  That's way more overhead than you want in an interrupt handler
> > > > so it's bad on its own, but if you're using something slow like a serial
> > > > console, it becomes even more of a problem.
> > > 
> > > Is it a problem for messages which we never expect to see?
> > 
> > Kind of.  While not as catastrophic, it's the same argument for why we
> > don't use BUG() anymore...when the impossible does manage to happen
> > there's unnecessary collateral damage on things outside of graphics.  If
> > we're adding huge delays inside an interrupt handler (while other
> > interrupts are disabled) that impacts the system-wide usability, not
> > just our own driver.
> > 
> > I'd also argue that these messages actually are semi-expected.  Random
> > bits being set shouldn't happen, but in the world of dgpu's, we do
> > occasionally see cases where the PCI link itself goes down for reasons
> > outside our control and then all registers read back as 0xFFFFFFFF,
> > which will probably trigger error messages here (as well as a bunch of
> > other places).
> 
> Could you expand a bit on what is semi-expected and when? I mean the
> circumstances of PCI link going down. We certainly don't have any code to
> survive that.

Yeah, I'm referring to the "Lost access to MMIO BAR" errors; in the past
most of them have ultimately been tracked down to bugs in early
firmware, so flashing an updated IFWI/BIOS onto the device usually
solved the problems.  Generally those buggy firmwares are an internal
problem that never make it into the wild, but I think we have also seen
cases where they get triggered by physical/electrical problems on a
specific part; that can potentially happen to anyone who's unlucky
enough to get a defective/damaged unit.

Basically "hardware returns all F's" happens because the CPU initiates
an MMIO transaction with the hardware, the hardware fails to produce any
response (possibly due to failing hardware, possibly due to
firmware/BIOS bugs), so 0xFFFFFFFF gets returned as an autocompletion to
prevent the CPU core from hanging.

It looks like we still have a few open here:
https://gitlab.freedesktop.org/search?search=%22Lost+access+to+MMIO+BAR%22&group_id=2642&project_id=4519&scope=issues&search_code=false&snippets=false&repository_ref=&nav_source=navbar

and there are some features on specific platforms we haven't turned on
yet because they also trigger these failures (which is still under
debug).

We don't/can't really do much to handle these problems in i915 today
except printing the 'lost access' error so that we know to ignore
whatever kinds of bogus errors we get after that point (usually lots of
messages about forcewake failing to clear, engine/GuC reset failing to
complete, etc.).  But aside from i915 being broken, the rest of the
platform should generally continue to work, so you can still access the
machine over the network, save logs to disk, etc.


Matt

> 
> > > > While the unexpected bits in the master tile register are strange and
> > > > may point to a bigger problem somewhere else, they're also harmless on
> > > > their own since we should just ignore those bits and only process the
> > > > valid tiles.
> > > 
> > > Yes, I was expecting that a patch belonging to multi-tile enablement would
> > > be incoming soon, which would be changing:
> > > 
> > > +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> > > +	    DG1_MSTR_TILE(0)) {
> > > +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> > > +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> > > +					     master_tile_ctl)));
> > > +		goto enable_none;
> > >   	}
> > > 
> > >  From this patch, into something completely different like walking bit by
> > > bit, handling the present tiles, and warning on unexpected ones. What should
> > > > remain though is warning on no tiles signaled (which is what we saw, together
> > > with garbage in reserved bits).
> > 
> > Yeah.  Although I still feel the interrupt handler should really just be
> > flagging the errors so that the actual prints themselves can happen
> > outside the interrupt.
> > 
> > > 
> > > > > In this particular case at least DRM_ERROR with no device info is the odd
> > > > > one out in the entire file so I'd suggest changing at least that, if the
> > > > > rest of my changes is of questionable benefit.
> > > > 
> > > > Changing DRM_ERROR -> drm_err would probably be fine in the short term
> > > > since it doesn't really make us any worse off.  Changing to drm_warn
> > > > might not be great since we're generating a lot more lines of output and
> > > 
> > > Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
> > > lot) more lines of output?
> > 
> > Sorry, my mistake; I had it in my mind that we were talking about a
> > drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
> > and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
> > negative impact.
> > 
> > > 
> > > But it can be drm_err for all I care, I don't think we really have
> > > consistent story between errors and warnings in this area.
> > > 
> > > > probably multiplying the already bad overhead that shouldn't be
> > > > happening in an interrupt handler.  But if we could update the interrupt
> > > > handler to just save away the details and do the actual drm_warn later,
> > > > outside the interrupt handler code, that would be okay.  We should
> > > > probably work toward something like that for all of our interrupt
> > > > handler warning/error messages.
> > > 
> > > Not sure I agree - for messages which we don't expect to see it doesn't
> > > really matter that there will be overhead when they are hit. Presumably bad
> > > > things are already happening there so spending effort to optimise those paths
> > > is questionable.
> > 
> > > Something bad is happening to graphics if we hit one of these cases.
> > But if we start doing prints while interrupts are disabled, we start
> > having more of a negative impact on the rest of the system too.
> 
> Truly for the case of this particular patch I don't think we should care.
> Rate limiting should be all that is needed in the short term to strike a
> balance between effort and benefit. But let's first clarify the PCI link
> going down problem.
> 
> Regards,
> 
> Tvrtko

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-06-06 15:21               ` Matt Roper
  0 siblings, 0 replies; 25+ messages in thread
From: Matt Roper @ 2022-06-06 15:21 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx, Lucas De Marchi, dri-devel

On Mon, Jun 06, 2022 at 12:55:20PM +0100, Tvrtko Ursulin wrote:
> 
> On 27/05/2022 19:42, Matt Roper wrote:
> > On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
> > > On 25/05/2022 19:05, Matt Roper wrote:
> > > > On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
> > > > > 
> > > > > On 24/05/2022 18:51, Matt Roper wrote:
> > > > > > On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
> > > > > > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > > > 
> > > > > > > Catch and log any garbage in the register, including no tiles marked, or
> > > > > > > multiple tiles marked.
> > > > > > > 
> > > > > > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > > > > > Cc: Matt Roper <matthew.d.roper@intel.com>
> > > > > > > ---
> > > > > > > We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
> > > > > > > during glmark and more badness. So I thought lets log all possible failure
> > > > > > > modes from here and also use per device logging.
> > > > > > > ---
> > > > > > >     drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
> > > > > > >     drivers/gpu/drm/i915/i915_reg.h |  1 +
> > > > > > >     2 files changed, 23 insertions(+), 11 deletions(-)
> > > > > > > 
> > > > > > > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > index 73cebc6aa650..79853d3fc1ed 100644
> > > > > > > --- a/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > > > > > > @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
> > > > > > >     	u32 gu_misc_iir;
> > > > > > >     	if (!intel_irqs_enabled(i915))
> > > > > > > -		return IRQ_NONE;
> > > > > > > +		goto none;
> > > > > > >     	master_tile_ctl = dg1_master_intr_disable(regs);
> > > > > > > -	if (!master_tile_ctl) {
> > > > > > > -		dg1_master_intr_enable(regs);
> > > > > > > -		return IRQ_NONE;
> > > > > > > +	if (!master_tile_ctl)
> > > > > > > +		goto enable_none;
> > > > > > > +
> > > > > > > +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
> > > > > > > +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
> > > > > > > +			 master_tile_ctl);
> > > > > > 
> > > > > > I know we have a bunch of them already, but shouldn't we be avoiding
> > > > > > printk-based stuff like this inside interrupt handlers?  Should we be
> > > > > > migrating all these error messages over to trace_printk or something
> > > > > > similar that's safer to use?
> > > > > 
> > > > > Not sure - I kind of think some really unexpected and worrying situations
> > > > > should be loud and on by default. Risk is then spam if not ratelimited.
> > > > > > Maybe we should instead ratelimit most errors/warnings coming from irq
> > > > > handlers?
> > > > 
> > > > It's not the risk of spam that's the problem, but rather that
> > > > printk-based stuff eventually calls into the console code to flush its
> > > > buffers.  That's way more overhead than you want in an interrupt handler
> > > > so it's bad on its own, but if you're using something slow like a serial
> > > > console, it becomes even more of a problem.
> > > 
> > > Is it a problem for messages which we never expect to see?
> > 
> > Kind of.  While not as catastrophic, it's the same argument for why we
> > don't use BUG() anymore...when the impossible does manage to happen
> > there's unnecessary collateral damage on things outside of graphics.  If
> > we're adding huge delays inside an interrupt handler (while other
> > interrupts are disabled) that impacts the system-wide usability, not
> > just our own driver.
> > 
> > I'd also argue that these messages actually are semi-expected.  Random
> > bits being set shouldn't happen, but in the world of dgpu's, we do
> > occasionally see cases where the PCI link itself goes down for reasons
> > outside our control and then all registers read back as 0xFFFFFFFF,
> > which will probably trigger error messages here (as well as a bunch of
> > other places).
> 
> Could you expand a bit on what is semi-expected and when? I mean the
> circumstances of PCI link going down. We certainly don't have any code to
> survive that.

Yeah, I'm referring to the "Lost access to MMIO BAR" errors; in the past
most of them have ultimately been tracked down to bugs in early
firmware, so flashing an updated IFWI/BIOS onto the device usually
solved the problems.  Generally those buggy firmwares are an internal
problem that never make it into the wild, but I think we have also seen
cases where they get triggered by physical/electrical problems on a
specific part; that can potentially happen to anyone who's unlucky
enough to get a defective/damaged unit.

Basically "hardware returns all F's" happens because the CPU initiates
an MMIO transaction with the hardware, the hardware fails to produce any
response (possibly due to failing hardware, possibly due to
firmware/BIOS bugs), so 0xFFFFFFFF gets returned as an autocompletion to
prevent the CPU core from hanging.

It looks like we still have a few open here:
https://gitlab.freedesktop.org/search?search=%22Lost+access+to+MMIO+BAR%22&group_id=2642&project_id=4519&scope=issues&search_code=false&snippets=false&repository_ref=&nav_source=navbar

and there are some features on specific platforms we haven't turned on
yet because they also trigger these failures (which is still under
debug).

We don't/can't really do much to handle these problems in i915 today
except printing the 'lost access' error so that we know to ignore
whatever kinds of bogus errors we get after that point (usually lots of
messages about forcewake failing to clear, engine/GuC reset failing to
complete, etc.).  But aside from i915 being broken, the rest of the
platform should generally continue to work, so you can still access the
machine over the network, save logs to disk, etc.


Matt

> 
> > > > While the unexpected bits in the master tile register are strange and
> > > > may point to a bigger problem somewhere else, they're also harmless on
> > > > their own since we should just ignore those bits and only process the
> > > > valid tiles.
> > > 
> > > Yes, I was expecting that a patch belonging to multi-tile enablement would
> > > be incoming soon, which would be changing:
> > > 
> > > +	if (REG_FIELD_GET(DG1_MSTR_TILE_MASK, master_tile_ctl) !=
> > > +	    DG1_MSTR_TILE(0)) {
> > > +		drm_warn(&i915->drm, "Unexpected irq from tile %u!\n",
> > > +			 ilog2(REG_FIELD_GET(DG1_MSTR_TILE_MASK,
> > > +					     master_tile_ctl)));
> > > +		goto enable_none;
> > >   	}
> > > 
> > >  From this patch, into something completely different like walking bit by
> > > bit, handling the present tiles, and warning on unexpected ones. What should
> > > > remain though is warning on no tiles signaled (which is what we saw, together
> > > with garbage in reserved bits).
> > 
> > Yeah.  Although I still feel the interrupt handler should really just be
> > flagging the errors so that the actual prints themselves can happen
> > outside the interrupt.
> > 
> > > 
> > > > > In this particular case at least DRM_ERROR with no device info is the odd
> > > > > one out in the entire file so I'd suggest changing at least that, if the
> > > > > rest of my changes is of questionable benefit.
> > > > 
> > > > Changing DRM_ERROR -> drm_err would probably be fine in the short term
> > > > since it doesn't really make us any worse off.  Changing to drm_warn
> > > > might not be great since we're generating a lot more lines of output and
> > > 
> > > Sorry I don't follow - why does replacing drm_err with drm_warn generate (a
> > > lot) more lines of output?
> > 
> > Sorry, my mistake; I had it in my mind that we were talking about a
> > drm_WARN_ON rather than just drm_warn (i.e., including a big stacktrace
> > and such).  DRM_ERROR -> drm_warn alone shouldn't have any extra
> > negative impact.
> > 
> > > 
> > > But it can be drm_err for all I care, I don't think we really have
> > > consistent story between errors and warnings in this area.
> > > 
> > > > probably multiplying the already bad overhead that shouldn't be
> > > > happening in an interrupt handler.  But if we could update the interrupt
> > > > handler to just save away the details and do the actual drm_warn later,
> > > > outside the interrupt handler code, that would be okay.  We should
> > > > probably work toward something like that for all of our interrupt
> > > > handler warning/error messages.
> > > 
> > > Not sure I agree - for messages which we don't expect to see it doesn't
> > > really matter that there will be overhead when they are hit. Presumably bad
> > > > things are already happening there so spending effort to optimise those paths
> > > is questionable.
> > 
> > > Something bad is happening to graphics if we hit one of these cases.
> > But if we start doing prints while interrupts are disabled, we start
> > having more of a negative impact on the rest of the system too.
> 
> Truly for the case of this particular patch I don't think we should care.
> Rate limiting should be all that is needed in the short term to strike a
> balance between effort and benefit. But let's first clarify the PCI link
> going down problem.
> 
> Regards,
> 
> Tvrtko

-- 
Matt Roper
Graphics Software Engineer
VTT-OSGC Platform Enablement
Intel Corporation

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
  2022-06-06 15:21               ` [Intel-gfx] " Matt Roper
@ 2022-06-07  9:20                 ` Tvrtko Ursulin
  -1 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-06-07  9:20 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, Lucas De Marchi, dri-devel, Tvrtko Ursulin


On 06/06/2022 16:21, Matt Roper wrote:
> On Mon, Jun 06, 2022 at 12:55:20PM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/05/2022 19:42, Matt Roper wrote:
>>> On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
>>>> On 25/05/2022 19:05, Matt Roper wrote:
>>>>> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>>>>>
>>>>>> On 24/05/2022 18:51, Matt Roper wrote:
>>>>>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>>>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>>
>>>>>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>>>>>> multiple tiles marked.
>>>>>>>>
>>>>>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>>>>>> ---
>>>>>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>>>>>> during glmark and more badness. So I thought lets log all possible failure
>>>>>>>> modes from here and also use per device logging.
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>>>>>      drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>>>>>      2 files changed, 23 insertions(+), 11 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>>>>>      	u32 gu_misc_iir;
>>>>>>>>      	if (!intel_irqs_enabled(i915))
>>>>>>>> -		return IRQ_NONE;
>>>>>>>> +		goto none;
>>>>>>>>      	master_tile_ctl = dg1_master_intr_disable(regs);
>>>>>>>> -	if (!master_tile_ctl) {
>>>>>>>> -		dg1_master_intr_enable(regs);
>>>>>>>> -		return IRQ_NONE;
>>>>>>>> +	if (!master_tile_ctl)
>>>>>>>> +		goto enable_none;
>>>>>>>> +
>>>>>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>>>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>>>>>> +			 master_tile_ctl);
>>>>>>>
>>>>>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>>>>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>>>>>> migrating all these error messages over to trace_printk or something
>>>>>>> similar that's safer to use?
>>>>>>
>>>>>> Not sure - I kind of think some really unexpected and worrying situations
>>>>>> should be loud and on by default. Risk is then spam if not ratelimited.
>>>>>> Maybe we should instead ratelimit most errors/warnings coming from irq
>>>>>> handlers?
>>>>>
>>>>> It's not the risk of spam that's the problem, but rather that
>>>>> printk-based stuff eventually calls into the console code to flush its
>>>>> buffers.  That's way more overhead than you want in an interrupt handler
>>>>> so it's bad on its own, but if you're using something slow like a serial
>>>>> console, it becomes even more of a problem.
>>>>
>>>> Is it a problem for messages which we never expect to see?
>>>
>>> Kind of.  While not as catastrophic, it's the same argument for why we
>>> don't use BUG() anymore...when the impossible does manage to happen
>>> there's unnecessary collateral damage on things outside of graphics.  If
>>> we're adding huge delays inside an interrupt handler (while other
>>> interrupts are disabled) that impacts the system-wide usability, not
>>> just our own driver.
>>>
>>> I'd also argue that these messages actually are semi-expected.  Random
>>> bits being set shouldn't happen, but in the world of dgpu's, we do
>>> occasionally see cases where the PCI link itself goes down for reasons
>>> outside our control and then all registers read back as 0xFFFFFFFF,
>>> which will probably trigger error messages here (as well as a bunch of
>>> other places).
>>
>> Could you expand a bit on what is semi-expected and when? I mean the
>> circumstances of PCI link going down. We certainly don't have any code to
>> survive that.
> 
> Yeah, I'm referring to the "Lost access to MMIO BAR" errors; in the past
> most of them have ultimately been tracked down to bugs in early
> firmware, so flashing an updated IFWI/BIOS onto the device usually
> solved the problems.  Generally those buggy firmwares are an internal
> problem that never make it into the wild, but I think we have also seen
> cases where they get triggered by physical/electrical problems on a
> specific part; that can potentially happen to anyone who's unlucky
> enough to get a defective/damaged unit.
> 
> Basically "hardware returns all F's" happens because the CPU initiates
> an MMIO transaction with the hardware, the hardware fails to produce any
> response (possibly due to failing hardware, possibly due to
> firmware/BIOS bugs), so 0xFFFFFFFF gets returned as an autocompletion to
> prevent the CPU core from hanging.
> 
> It looks like we still have a few open here:
> https://gitlab.freedesktop.org/search?search=%22Lost+access+to+MMIO+BAR%22&group_id=2642&project_id=4519&scope=issues&search_code=false&snippets=false&repository_ref=&nav_source=navbar
> 
> and there are some features on specific platforms we haven't turned on
> yet because they also trigger these failures (which is still under
> debug).
> 
> We don't/can't really do much to handle these problems in i915 today
> except printing the 'lost access' error so that we know to ignore
> whatever kinds of bogus errors we get after that point (usually lots of
> messages about forcewake failing to clear, engine/GuC reset failing to
> complete, etc.).  But aside from i915 being broken, the rest of the
> platform should generally continue to work, so you can still access the
> machine over the network, save logs to disk, etc.

Interesting, I missed the addition of 29b6f88d60dd ("drm/i915: Try to 
detect sudden loss of MMIO access"), thanks!

As for my "Garbage in master_tile_ctl" or "Unexpected irq from 
tile.." messages, in the case of a lost PCI link they should happen only 
once. I don't think the hardware will keep raising interrupts if the 
driver cannot talk to it. But it does seem prudent to go with the 
rate-limiting flavour just in case.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [Intel-gfx] [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR
@ 2022-06-07  9:20                 ` Tvrtko Ursulin
  0 siblings, 0 replies; 25+ messages in thread
From: Tvrtko Ursulin @ 2022-06-07  9:20 UTC (permalink / raw)
  To: Matt Roper; +Cc: Intel-gfx, Lucas De Marchi, dri-devel


On 06/06/2022 16:21, Matt Roper wrote:
> On Mon, Jun 06, 2022 at 12:55:20PM +0100, Tvrtko Ursulin wrote:
>>
>> On 27/05/2022 19:42, Matt Roper wrote:
>>> On Thu, May 26, 2022 at 11:18:17AM +0100, Tvrtko Ursulin wrote:
>>>> On 25/05/2022 19:05, Matt Roper wrote:
>>>>> On Wed, May 25, 2022 at 05:03:13PM +0100, Tvrtko Ursulin wrote:
>>>>>>
>>>>>> On 24/05/2022 18:51, Matt Roper wrote:
>>>>>>> On Tue, May 24, 2022 at 10:43:39AM +0100, Tvrtko Ursulin wrote:
>>>>>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>>
>>>>>>>> Catch and log any garbage in the register, including no tiles marked, or
>>>>>>>> multiple tiles marked.
>>>>>>>>
>>>>>>>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>>>>> Cc: Matt Roper <matthew.d.roper@intel.com>
>>>>>>>> ---
>>>>>>>> We caught garbage in DG1_MSTR_TILE_INTR with DG2 (actual value 0xF9D2C008)
>>>>>>>> during glmark and more badness. So I thought let's log all possible failure
>>>>>>>> modes from here and also use per device logging.
>>>>>>>> ---
>>>>>>>>      drivers/gpu/drm/i915/i915_irq.c | 33 ++++++++++++++++++++++-----------
>>>>>>>>      drivers/gpu/drm/i915/i915_reg.h |  1 +
>>>>>>>>      2 files changed, 23 insertions(+), 11 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> index 73cebc6aa650..79853d3fc1ed 100644
>>>>>>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>>>>>>> @@ -2778,24 +2778,30 @@ static irqreturn_t dg1_irq_handler(int irq, void *arg)
>>>>>>>>      	u32 gu_misc_iir;
>>>>>>>>      	if (!intel_irqs_enabled(i915))
>>>>>>>> -		return IRQ_NONE;
>>>>>>>> +		goto none;
>>>>>>>>      	master_tile_ctl = dg1_master_intr_disable(regs);
>>>>>>>> -	if (!master_tile_ctl) {
>>>>>>>> -		dg1_master_intr_enable(regs);
>>>>>>>> -		return IRQ_NONE;
>>>>>>>> +	if (!master_tile_ctl)
>>>>>>>> +		goto enable_none;
>>>>>>>> +
>>>>>>>> +	if (master_tile_ctl & ~(DG1_MSTR_IRQ | DG1_MSTR_TILE_MASK)) {
>>>>>>>> +		drm_warn(&i915->drm, "Garbage in master_tile_ctl: 0x%08x!\n",
>>>>>>>> +			 master_tile_ctl);
>>>>>>>
>>>>>>> I know we have a bunch of them already, but shouldn't we be avoiding
>>>>>>> printk-based stuff like this inside interrupt handlers?  Should we be
>>>>>>> migrating all these error messages over to trace_printk or something
>>>>>>> similar that's safer to use?
>>>>>>
>>>>>> Not sure - I kind of think some really unexpected and worrying situations
>>>>>> should be loud and on by default. Risk is then spam if not ratelimited.
>>>>>> Maybe we should instead ratelimit most errors/warnings coming from irq
>>>>>> handlers?
>>>>>
>>>>> It's not the risk of spam that's the problem, but rather that
>>>>> printk-based stuff eventually calls into the console code to flush its
>>>>> buffers.  That's way more overhead than you want in an interrupt handler
>>>>> so it's bad on its own, but if you're using something slow like a serial
>>>>> console, it becomes even more of a problem.
>>>>
>>>> Is it a problem for messages which we never expect to see?
>>>
>>> Kind of.  While not as catastrophic, it's the same argument for why we
>>> don't use BUG() anymore...when the impossible does manage to happen
>>> there's unnecessary collateral damage on things outside of graphics.  If
>>> we're adding huge delays inside an interrupt handler (while other
>>> interrupts are disabled) that impacts the system-wide usability, not
>>> just our own driver.
>>>
>>> I'd also argue that these messages actually are semi-expected.  Random
>>> bits being set shouldn't happen, but in the world of dGPUs, we do
>>> occasionally see cases where the PCI link itself goes down for reasons
>>> outside our control and then all registers read back as 0xFFFFFFFF,
>>> which will probably trigger error messages here (as well as a bunch of
>>> other places).
>>
>> Could you expand a bit on what is semi-expected and when? I mean the
>> circumstances of PCI link going down. We certainly don't have any code to
>> survive that.
> 
> Yeah, I'm referring to the "Lost access to MMIO BAR" errors; in the past
> most of them have ultimately been tracked down to bugs in early
> firmware, so flashing an updated IFWI/BIOS onto the device usually
> solved the problems.  Generally those buggy firmwares are an internal
> problem that never makes it into the wild, but I think we have also seen
> cases where they get triggered by physical/electrical problems on a
> specific part; that can potentially happen to anyone who's unlucky
> enough to get a defective/damaged unit.
> 
> Basically "hardware returns all F's" happens because the CPU initiates
> an MMIO transaction with the hardware, the hardware fails to produce any
> response (possibly due to failing hardware, possibly due to
> firmware/BIOS bugs), so 0xFFFFFFFF gets returned as an autocompletion to
> prevent the CPU core from hanging.
> 
> It looks like we still have a few open here:
> https://gitlab.freedesktop.org/search?search=%22Lost+access+to+MMIO+BAR%22&group_id=2642&project_id=4519&scope=issues&search_code=false&snippets=false&repository_ref=&nav_source=navbar
> 
> and there are some features on specific platforms we haven't turned on
> yet because they also trigger these failures (which is still under
> debug).
> 
> We don't/can't really do much to handle these problems in i915 today
> except printing the 'lost access' error so that we know to ignore
> whatever kinds of bogus errors we get after that point (usually lots of
> messages about forcewake failing to clear, engine/GuC reset failing to
> complete, etc.).  But aside from i915 being broken, the rest of the
> platform should generally continue to work, so you can still access the
> machine over the network, save logs to disk, etc.

Interesting, I missed the addition of 29b6f88d60dd ("drm/i915: Try to 
detect sudden loss of MMIO access"), thanks!

As for my "Garbage in master_tile_ctl" and "Unexpected irq from
tile.." messages: if the PCI link is lost they should happen only once,
since I don't think the hardware will keep raising interrupts if the
driver cannot talk to it. But it does seem prudent to go with the
rate-limiting flavour just in case.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2022-06-07  9:20 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-24  9:43 [PATCH] drm/i915/dg2: Catch and log more unexpected values in DG1_MSTR_TILE_INTR Tvrtko Ursulin
2022-05-24  9:43 ` [Intel-gfx] " Tvrtko Ursulin
2022-05-24 10:01 ` [Intel-gfx] ✗ Fi.CI.SPARSE: warning for " Patchwork
2022-05-24 10:22 ` [Intel-gfx] ✓ Fi.CI.BAT: success " Patchwork
2022-05-24 11:46 ` [Intel-gfx] ✗ Fi.CI.IGT: failure " Patchwork
2022-05-24 17:51 ` [PATCH] " Matt Roper
2022-05-24 17:51   ` [Intel-gfx] " Matt Roper
2022-05-25 16:03   ` Tvrtko Ursulin
2022-05-25 16:03     ` [Intel-gfx] " Tvrtko Ursulin
2022-05-25 18:05     ` Matt Roper
2022-05-25 18:05       ` [Intel-gfx] " Matt Roper
2022-05-26 10:18       ` Tvrtko Ursulin
2022-05-26 10:18         ` [Intel-gfx] " Tvrtko Ursulin
2022-05-27 18:42         ` Matt Roper
2022-05-27 18:42           ` Matt Roper
2022-06-06 11:55           ` Tvrtko Ursulin
2022-06-06 11:55             ` [Intel-gfx] " Tvrtko Ursulin
2022-06-06 15:21             ` Matt Roper
2022-06-06 15:21               ` [Intel-gfx] " Matt Roper
2022-06-07  9:20               ` Tvrtko Ursulin
2022-06-07  9:20                 ` [Intel-gfx] " Tvrtko Ursulin
2022-05-25 18:14     ` Lucas De Marchi
2022-05-25 18:14       ` [Intel-gfx] " Lucas De Marchi
2022-05-26 10:29       ` Tvrtko Ursulin
2022-05-26 10:29         ` [Intel-gfx] " Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.