All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
@ 2017-11-10 11:53 Mika Kuoppala
  2017-11-10 12:16 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Mika Kuoppala @ 2017-11-10 11:53 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

We have a problem of distinguishing intended hangs
submitted by igt during CI/bat and hangs that are nonintended
happening in close proximity.

As we know how igt constructs a batch intended to hang
the gpu, we can use this in our advantage when error state
is constructed. The signature of a intended hang can
be parsed out in this stage.

Batches that are expected to hang can be watermarked by:

batch[i++] = MI_BATCH_BUFFER_END;
batch[i++] = MI_NOOP;
batch[i++] = IGT_HANG_SIGNATURE; /* 0xc5c5c5c5 */

Note that we do the parsing only with gem debug builds.

Later when we spew out notification about the hang into
the dmesg, we can use DRM_INFO for intended hangs
and DRM_WARN for nonintended. This way auxiliary tools
can make better judgement on what to consider important
for reporting.

Cc: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c              |   2 +-
 drivers/gpu/drm/i915/i915_drv.h                  |   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.c            | 100 +++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c                  |   4 +-
 drivers/gpu/drm/i915/intel_hangcheck.c           |   6 +-
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c |   2 +-
 6 files changed, 104 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 39883cd915db..678f472a8956 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4092,7 +4092,7 @@ i915_wedged_set(void *data, u64 val)
 		engine->hangcheck.stalled = true;
 	}
 
-	i915_handle_error(i915, val, "Manually setting wedged to %llu", val);
+	i915_handle_error(i915, true, val, "Manually setting wedged to %llu", val);
 
 	wait_on_bit(&i915->gpu_error.flags,
 		    I915_RESET_HANDOFF,
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fe93115c4caa..91e0ab8509d2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1004,6 +1004,8 @@ struct i915_gpu_state {
 			u32 *pages[0];
 		} *ringbuffer, *batchbuffer, *wa_batchbuffer, *ctx, *hws_page;
 
+		bool batch_had_hang_signature;
+
 		struct drm_i915_error_object **user_bo;
 		long user_bo_count;
 
@@ -3372,8 +3374,9 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
 			   &dev_priv->gpu_error.hangcheck_work, delay);
 }
 
-__printf(3, 4)
+__printf(4, 5)
 void i915_handle_error(struct drm_i915_private *dev_priv,
+		       bool intentional,
 		       u32 engine_mask,
 		       const char *fmt, ...);
 
@@ -3960,6 +3963,7 @@ static inline void i915_error_state_buf_release(
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
+			      bool intentional,
 			      u32 engine_mask,
 			      const char *error_msg);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 5c2d83a838d8..a5bfefb94d6b 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -918,15 +918,52 @@ void __i915_gpu_state_free(struct kref *error_ref)
 	kfree(error);
 }
 
+#ifdef CONFIG_DRM_I915_DEBUG_GEM
+
+#define HANG_SIGNATURE_MAGIC 0xc5c5c5c5
+
+static bool batch_has_hang_signature(const u32 * const batch, u64 len)
+{
+	struct marker { u32 val; u64 offset; } m[2] = {
+		{ MI_BATCH_BUFFER_END, -1 },
+		{ HANG_SIGNATURE_MAGIC, -1 }
+	};
+	u64 i, j;
+
+	len = min_t(u64, len, PAGE_SIZE);
+	len = DIV_ROUND_UP(len, 4);
+
+	for (i = 0, j = 0; i < len && j < ARRAY_SIZE(m); i++) {
+		if (m[j].offset == -1 && batch[i] == m[j].val)
+			m[j++].offset = i;
+
+		if (m[0].offset != -1 && i - m[0].offset > 2)
+			break;
+	}
+
+	if (m[0].offset == -1 || m[1].offset == -1)
+		return false;
+
+	return m[1].offset - m[0].offset == 2;
+}
+#else
+static bool batch_has_hang_signature(struct i915_vma *vma)
+{
+	return false;
+}
+#endif
+
 static struct drm_i915_error_object *
-i915_error_object_create(struct drm_i915_private *i915,
-			 struct i915_vma *vma)
+__i915_error_object_create(struct drm_i915_private *i915,
+			   struct i915_vma *vma,
+			   bool *hangsig)
 {
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	const u64 slot = ggtt->error_capture.start;
 	struct drm_i915_error_object *dst;
 	struct compress compress;
-	unsigned long num_pages;
+	unsigned long num_pages, src_pages;
+	unsigned long page_count = 0;
 	struct sgt_iter iter;
 	dma_addr_t dma;
 
@@ -934,6 +971,7 @@ i915_error_object_create(struct drm_i915_private *i915,
 		return NULL;
 
 	num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
+	src_pages = num_pages;
 	num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
 	dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
 		      GFP_ATOMIC | __GFP_NOWARN);
@@ -958,9 +996,16 @@ i915_error_object_create(struct drm_i915_private *i915,
 				       I915_CACHE_NONE, 0);
 
 		s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
+
+		/* Check last page only for intentional hang signature */
+		if (hangsig && page_count == src_pages - 1)
+			*hangsig = batch_has_hang_signature((void __force *)s,
+							    vma->size);
+
 		ret = compress_page(&compress, (void  __force *)s, dst);
 		io_mapping_unmap_atomic(s);
 
+		page_count++;
 		if (ret)
 			goto unwind;
 	}
@@ -978,6 +1023,21 @@ i915_error_object_create(struct drm_i915_private *i915,
 	return dst;
 }
 
+static struct drm_i915_error_object *
+i915_error_object_create(struct drm_i915_private *i915,
+			 struct i915_vma *vma)
+{
+	return __i915_error_object_create(i915, vma, NULL);
+}
+
+static struct drm_i915_error_object *
+i915_error_batch_object_create(struct drm_i915_private *i915,
+			       struct i915_vma *vma,
+			       bool *signature)
+{
+	return __i915_error_object_create(i915, vma, signature);
+}
+
 /* The error capture is special as tries to run underneath the normal
  * locking rules - so we use the raw version of the i915_gem_active lookup.
  */
@@ -1484,8 +1544,9 @@ static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
 			 * by userspace.
 			 */
 			ee->batchbuffer =
-				i915_error_object_create(dev_priv,
-							 request->batch);
+				i915_error_batch_object_create(dev_priv,
+							       request->batch,
+							       &ee->batch_had_hang_signature);
 
 			if (HAS_BROKEN_CS_TLB(dev_priv))
 				ee->wa_batchbuffer =
@@ -1701,7 +1762,8 @@ static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
 static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
 				   struct i915_gpu_state *error,
 				   u32 engine_mask,
-				   const char *error_msg)
+				   const char *error_msg,
+				   bool simulated)
 {
 	u32 ecode;
 	int engine_id = -1, len;
@@ -1709,7 +1771,8 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
 	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
+			"GPU HANG%s: ecode %d:%d:0x%08x",
+			simulated ? "(simulated)" : "",
 			INTEL_GEN(dev_priv), engine_id, ecode);
 
 	if (engine_id != -1 && error->engine[engine_id].context.pid)
@@ -1803,6 +1866,18 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
 	return error;
 }
 
+static bool error_state_has_hang_signature(const struct i915_gpu_state *error)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
+		if (error->engine[i].batch_had_hang_signature)
+			return true;
+	}
+
+	return false;
+}
+
 /**
  * i915_capture_error_state - capture an error record for later analysis
  * @dev: drm device
@@ -1813,12 +1888,14 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
+			      bool intentional,
 			      u32 engine_mask,
 			      const char *error_msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
 	unsigned long flags;
+	bool expected;
 
 	if (!i915_modparams.error_capture)
 		return;
@@ -1832,8 +1909,13 @@ void i915_capture_error_state(struct drm_i915_private *dev_priv,
 		return;
 	}
 
-	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	expected = intentional || error_state_has_hang_signature(error);
+	i915_error_capture_msg(dev_priv, error, engine_mask, error_msg, expected);
+
+	if (expected)
+		DRM_INFO("%s\n", error->error_msg);
+	else
+		DRM_WARN("%s\n", error->error_msg);
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index ff00e462697a..75f519f910cc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2828,6 +2828,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
 /**
  * i915_handle_error - handle a gpu error
  * @dev_priv: i915 device private
+ * @intentional: error was intentionally injected
  * @engine_mask: mask representing engines that are hung
  * @fmt: Error message format string
  *
@@ -2838,6 +2839,7 @@ static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
  * of a ring dump etc.).
  */
 void i915_handle_error(struct drm_i915_private *dev_priv,
+		       bool intentional,
 		       u32 engine_mask,
 		       const char *fmt, ...)
 {
@@ -2859,7 +2861,7 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
 	 */
 	intel_runtime_pm_get(dev_priv);
 
-	i915_capture_error_state(dev_priv, engine_mask, error_msg);
+	i915_capture_error_state(dev_priv, intentional, engine_mask, error_msg);
 	i915_clear_error_registers(dev_priv);
 
 	/*
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 12ac270a5f93..7b9e8dec7d65 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -266,7 +266,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(engine);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev_priv, 0,
+		i915_handle_error(dev_priv, false, 0,
 				  "Kicking stuck wait on %s",
 				  engine->name);
 		I915_WRITE_CTL(engine, tmp);
@@ -278,7 +278,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 		default:
 			return ENGINE_DEAD;
 		case 1:
-			i915_handle_error(dev_priv, 0,
+			i915_handle_error(dev_priv, false, 0,
 					  "Kicking stuck semaphore on %s",
 					  engine->name);
 			I915_WRITE_CTL(engine, tmp);
@@ -407,7 +407,7 @@ static void hangcheck_declare_hang(struct drm_i915_private *i915,
 				 "%s, ", engine->name);
 	msg[len-2] = '\0';
 
-	return i915_handle_error(i915, hung, "%s", msg);
+	return i915_handle_error(i915, false, hung, "%s", msg);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 71ce06680d66..7900bba9fe16 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -846,7 +846,7 @@ static int igt_handle_error(void *arg)
 	engine->hangcheck.stalled = true;
 	engine->hangcheck.seqno = intel_engine_get_seqno(engine);
 
-	i915_handle_error(i915, intel_engine_flag(engine), "%s", __func__);
+	i915_handle_error(i915, true, intel_engine_flag(engine), "%s", __func__);
 
 	xchg(&i915->gpu_error.first_error, error);
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 11:53 [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs Mika Kuoppala
@ 2017-11-10 12:16 ` Chris Wilson
  2017-11-10 12:20   ` Mika Kuoppala
  2017-11-10 12:47 ` ✓ Fi.CI.BAT: success for " Patchwork
  2017-11-10 14:22 ` ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 1 reply; 9+ messages in thread
From: Chris Wilson @ 2017-11-10 12:16 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: Daniel Vetter

Quoting Mika Kuoppala (2017-11-10 11:53:47)
> We have a problem of distinguishing intended hangs
> submitted by igt during CI/bat and hangs that are nonintended
> happening in close proximity.

Do we? I haven't had that problem in distinguishing them.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 12:16 ` Chris Wilson
@ 2017-11-10 12:20   ` Mika Kuoppala
  2017-11-10 12:30     ` Chris Wilson
  0 siblings, 1 reply; 9+ messages in thread
From: Mika Kuoppala @ 2017-11-10 12:20 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Daniel Vetter

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2017-11-10 11:53:47)
>> We have a problem of distinguishing intended hangs
>> submitted by igt during CI/bat and hangs that are nonintended
>> happening in close proximity.
>
> Do we? I haven't had that problem in distinguishing them.

Piglit can't tell them apart afaik. Due to info level.

-Mika


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 12:20   ` Mika Kuoppala
@ 2017-11-10 12:30     ` Chris Wilson
  2017-11-10 12:49       ` Mika Kuoppala
  2017-11-10 12:52       ` Chris Wilson
  0 siblings, 2 replies; 9+ messages in thread
From: Chris Wilson @ 2017-11-10 12:30 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: Daniel Vetter

Quoting Mika Kuoppala (2017-11-10 12:20:55)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2017-11-10 11:53:47)
> >> We have a problem of distinguishing intended hangs
> >> submitted by igt during CI/bat and hangs that are nonintended
> >> happening in close proximity.
> >
> > Do we? I haven't had that problem in distinguishing them.
> 
> Piglit can't tell them apart afaik. Due to info level.

Piglit? If the test passes, it doesn't matter how the kernel got there,
the user behaviour is as expected. If the test wants to assert that it
didn't hang, it can do that.

I am very opposed to interpreting magic contents within the kernel, more
so when it is already known by userspace.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 11:53 [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs Mika Kuoppala
  2017-11-10 12:16 ` Chris Wilson
@ 2017-11-10 12:47 ` Patchwork
  2017-11-10 14:22 ` ✓ Fi.CI.IGT: " Patchwork
  2 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2017-11-10 12:47 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Print dmesg warn on unintended hangs
URL   : https://patchwork.freedesktop.org/series/33597/
State : success

== Summary ==

Series 33597v1 drm/i915: Print dmesg warn on unintended hangs
https://patchwork.freedesktop.org/api/1.0/series/33597/revisions/1/mbox/

fi-bdw-5557u     total:289  pass:268  dwarn:0   dfail:0   fail:0   skip:21  time:444s
fi-bdw-gvtdvm    total:289  pass:265  dwarn:0   dfail:0   fail:0   skip:24  time:454s
fi-blb-e6850     total:289  pass:223  dwarn:1   dfail:0   fail:0   skip:65  time:382s
fi-bsw-n3050     total:289  pass:243  dwarn:0   dfail:0   fail:0   skip:46  time:540s
fi-bwr-2160      total:289  pass:183  dwarn:0   dfail:0   fail:0   skip:106 time:274s
fi-bxt-dsi       total:289  pass:259  dwarn:0   dfail:0   fail:0   skip:30  time:498s
fi-bxt-j4205     total:289  pass:260  dwarn:0   dfail:0   fail:0   skip:29  time:505s
fi-byt-j1900     total:289  pass:254  dwarn:0   dfail:0   fail:0   skip:35  time:498s
fi-byt-n2820     total:289  pass:250  dwarn:0   dfail:0   fail:0   skip:39  time:499s
fi-elk-e7500     total:289  pass:229  dwarn:0   dfail:0   fail:0   skip:60  time:427s
fi-gdg-551       total:289  pass:178  dwarn:1   dfail:0   fail:1   skip:109 time:259s
fi-glk-1         total:289  pass:261  dwarn:0   dfail:0   fail:0   skip:28  time:545s
fi-hsw-4770      total:289  pass:262  dwarn:0   dfail:0   fail:0   skip:27  time:430s
fi-hsw-4770r     total:289  pass:262  dwarn:0   dfail:0   fail:0   skip:27  time:440s
fi-ilk-650       total:289  pass:228  dwarn:0   dfail:0   fail:0   skip:61  time:427s
fi-ivb-3520m     total:289  pass:260  dwarn:0   dfail:0   fail:0   skip:29  time:482s
fi-ivb-3770      total:289  pass:260  dwarn:0   dfail:0   fail:0   skip:29  time:460s
fi-kbl-7500u     total:289  pass:264  dwarn:1   dfail:0   fail:0   skip:24  time:481s
fi-kbl-7560u     total:289  pass:270  dwarn:0   dfail:0   fail:0   skip:19  time:521s
fi-kbl-7567u     total:289  pass:269  dwarn:0   dfail:0   fail:0   skip:20  time:473s
fi-kbl-r         total:289  pass:262  dwarn:0   dfail:0   fail:0   skip:27  time:530s
fi-pnv-d510      total:289  pass:222  dwarn:1   dfail:0   fail:0   skip:66  time:574s
fi-skl-6260u     total:289  pass:269  dwarn:0   dfail:0   fail:0   skip:20  time:466s
fi-skl-6600u     total:289  pass:262  dwarn:0   dfail:0   fail:0   skip:27  time:546s
fi-skl-6700hq    total:289  pass:263  dwarn:0   dfail:0   fail:0   skip:26  time:565s
fi-skl-6700k     total:289  pass:265  dwarn:0   dfail:0   fail:0   skip:24  time:522s
fi-skl-6770hq    total:289  pass:269  dwarn:0   dfail:0   fail:0   skip:20  time:499s
fi-skl-gvtdvm    total:289  pass:266  dwarn:0   dfail:0   fail:0   skip:23  time:462s
fi-snb-2520m     total:289  pass:250  dwarn:0   dfail:0   fail:0   skip:39  time:559s
fi-snb-2600      total:289  pass:249  dwarn:0   dfail:0   fail:0   skip:40  time:420s
Blacklisted hosts:
fi-cfl-s         total:289  pass:254  dwarn:3   dfail:0   fail:0   skip:32  time:529s
fi-glk-dsi       total:289  pass:259  dwarn:0   dfail:0   fail:0   skip:30  time:489s
fi-cnl-y failed to connect after reboot

c36994b067f0771eee4d3136f40c0c6040a8fa77 drm-tip: 2017y-11m-10d-11h-48m-19s UTC integration manifest
b4aa111288af drm/i915: Print dmesg warn on unintended hangs

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_7057/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 12:30     ` Chris Wilson
@ 2017-11-10 12:49       ` Mika Kuoppala
  2017-11-10 13:21         ` Daniel Vetter
  2017-11-10 12:52       ` Chris Wilson
  1 sibling, 1 reply; 9+ messages in thread
From: Mika Kuoppala @ 2017-11-10 12:49 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Daniel Vetter

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2017-11-10 12:20:55)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Quoting Mika Kuoppala (2017-11-10 11:53:47)
>> >> We have a problem of distinguishing intended hangs
>> >> submitted by igt during CI/bat and hangs that are nonintended
>> >> happening in close proximity.
>> >
>> > Do we? I haven't had that problem in distinguishing them.
>> 
>> Piglit can't tell them apart afaik. Due to info level.
>
> Piglit? If the test passes, it doesn't matter how the kernel got there,
> the user behaviour is as expected. If the test wants to assert that it
> didn't hang, it can do that.

Through reset counts? At starters we could assert in framework that
all tests that do not call igt_hang() expect reset count to
stay the same between entry/exit.

I see the logic behind that user behaviour is as expected.

Would be good that CI folks chime in here and detail how
they want things to work.

-Mika
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 12:30     ` Chris Wilson
  2017-11-10 12:49       ` Mika Kuoppala
@ 2017-11-10 12:52       ` Chris Wilson
  1 sibling, 0 replies; 9+ messages in thread
From: Chris Wilson @ 2017-11-10 12:52 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: Daniel Vetter

Quoting Chris Wilson (2017-11-10 12:30:21)
> Quoting Mika Kuoppala (2017-11-10 12:20:55)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > 
> > > Quoting Mika Kuoppala (2017-11-10 11:53:47)
> > >> We have a problem of distinguishing intended hangs
> > >> submitted by igt during CI/bat and hangs that are nonintended
> > >> happening in close proximity.
> > >
> > > Do we? I haven't had that problem in distinguishing them.
> > 
> > Piglit can't tell them apart afaik. Due to info level.
> 
> Piglit? If the test passes, it doesn't matter how the kernel got there,
> the user behaviour is as expected. If the test wants to assert that it
> didn't hang, it can do that.
> 
> I am very opposed to interpreting magic contents within the kernel, more
> so when it is already known by userspace.

A clearer form of identification would be either a context-param saying
that userspace is going to use this context for injecting hang, which we
already have and use, or for fine grained precision having an execbuf
flag (copying onto the request) saying that this is the hanging batch.

But that only makes sense (to me) in the context of gem_reset_stats,
drv_hangman, gem_exec_capture or whichever, where you want such fine
grained checking. And as some point, we acknowledge we are checking
kernel internals (not user behaviour) and move it to the selftests.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 12:49       ` Mika Kuoppala
@ 2017-11-10 13:21         ` Daniel Vetter
  0 siblings, 0 replies; 9+ messages in thread
From: Daniel Vetter @ 2017-11-10 13:21 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: Daniel Vetter, intel-gfx

On Fri, Nov 10, 2017 at 02:49:25PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2017-11-10 12:20:55)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> 
> >> > Quoting Mika Kuoppala (2017-11-10 11:53:47)
> >> >> We have a problem of distinguishing intended hangs
> >> >> submitted by igt during CI/bat and hangs that are nonintended
> >> >> happening in close proximity.
> >> >
> >> > Do we? I haven't had that problem in distinguishing them.
> >> 
> >> Piglit can't tell them apart afaik. Due to info level.
> >
> > Piglit? If the test passes, it doesn't matter how the kernel got there,
> > the user behaviour is as expected. If the test wants to assert that it
> > didn't hang, it can do that.
> 
> Through reset counts? At starters we could assert in framework that
> all tests that do not call igt_hang() expect reset count to
> stay the same between entry/exit.
> 
> I see the logic behind that user behaviour is as expected.
> 
> Would be good that CI folks chime in here and detail how
> they want things to work.

I'm very vary of having to sprinkle that all over CI tbh, but if it's in
the framework I guess it can work too. Will be fun to figure out how to
catch unintended hangs in the tests that do provoke hangs, but should be
doable.

But for adding it to the framework I think we're already putting way too
much random quiescent stuff in there, and for generic kms tests there's
kinda no need for that. So not entirely sold that this is the best
approach we can do.

A semi-middleground would be if we have new functions that open a gem fd
for rendering, and we have some sanity-checks to make sure that only when
you ask for rendering do the igt ioctl wrappers allow you to. Then we
could stuff all these checks in there.

But that still leaves the issue that a gpu hang on e.g. a s/r test or
module reload won't be caught, and we really want to catch these.

Module reload btw is also one case where just checking the reset counter
will just not work. And module reload is exactly one of these cases where
we do want to make sure we don't misprogram the gpu so it dies.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

* ✓ Fi.CI.IGT: success for drm/i915: Print dmesg warn on unintended hangs
  2017-11-10 11:53 [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs Mika Kuoppala
  2017-11-10 12:16 ` Chris Wilson
  2017-11-10 12:47 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2017-11-10 14:22 ` Patchwork
  2 siblings, 0 replies; 9+ messages in thread
From: Patchwork @ 2017-11-10 14:22 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Print dmesg warn on unintended hangs
URL   : https://patchwork.freedesktop.org/series/33597/
State : success

== Summary ==

Test kms_flip:
        Subgroup plain-flip-ts-check:
                pass       -> FAIL       (shard-hsw) fdo#100368
        Subgroup dpms-vs-vblank-race-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#103060 +1
Test kms_setmode:
        Subgroup basic:
                fail       -> PASS       (shard-hsw) fdo#99912
Test kms_busy:
        Subgroup extended-modeset-hang-oldfb-with-reset-render-c:
                dmesg-warn -> PASS       (shard-hsw) fdo#102249

fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103060 https://bugs.freedesktop.org/show_bug.cgi?id=103060
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#102249 https://bugs.freedesktop.org/show_bug.cgi?id=102249

shard-hsw        total:2584 pass:1451 dwarn:1   dfail:2   fail:12  skip:1118 time:9354s
Blacklisted hosts:
shard-apl        total:2511 pass:1553 dwarn:3   dfail:2   fail:19  skip:934 time:12317s
shard-kbl        total:2534 pass:1654 dwarn:20  dfail:4   fail:24  skip:830 time:9681s
shard-snb        total:2584 pass:1191 dwarn:1   dfail:2   fail:12  skip:1378 time:7764s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_7057/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-11-10 14:22 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-11-10 11:53 [PATCH RFC] drm/i915: Print dmesg warn on unintended hangs Mika Kuoppala
2017-11-10 12:16 ` Chris Wilson
2017-11-10 12:20   ` Mika Kuoppala
2017-11-10 12:30     ` Chris Wilson
2017-11-10 12:49       ` Mika Kuoppala
2017-11-10 13:21         ` Daniel Vetter
2017-11-10 12:52       ` Chris Wilson
2017-11-10 12:47 ` ✓ Fi.CI.BAT: success for " Patchwork
2017-11-10 14:22 ` ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.