All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/7] drm/i915/selftests: Flush old resets between engines
@ 2018-02-05  9:21 Chris Wilson
  2018-02-05  9:21 ` [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing Chris Wilson
                   ` (7 more replies)
  0 siblings, 8 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:21 UTC (permalink / raw)
  To: intel-gfx

When injecting rapid resets, we must be careful to at least wait for the
previous reset to have taken effect and the engine restarted. If we
perform a second reset before that has happened, we will notice that the
engine hasn't recovered and declare it lost, wedging the device and
failing. In practice, since we wait for each hanging batch to start
before injecting the reset, this too-fast-reset condition can only be
triggered when moving onto the next engine in the test, so we need only
wait for the existing reset to complete before switching engines.

v2: Wrap up the wait inside a safety net to bail out in case of angry hw.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 65 ++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index d1f91a533afa..a4f4ff22389b 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -244,6 +244,57 @@ static u32 hws_seqno(const struct hang *h,
 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const void *symbol;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	pr_err("%pS timed out, cancelling all further testing.\n",
+	       w->symbol);
+	i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+			 struct drm_i915_private *i915,
+			 long timeout,
+			 const void *symbol)
+{
+	w->i915 = i915;
+	w->symbol = symbol;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
+
+#define wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
+	     (W)->i915;							\
+	     __fini_wedge((W)))
+
+static int flush_test(struct drm_i915_private *i915, unsigned int flags)
+{
+	struct wedge_me w;
+
+	cond_resched();
+
+	wedge_on_timeout(&w, i915, HZ)
+		i915_gem_wait_for_idle(i915, flags);
+
+	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+}
+
 static void hang_fini(struct hang *h)
 {
 	*h->batch = MI_BATCH_BUFFER_END;
@@ -255,7 +306,7 @@ static void hang_fini(struct hang *h)
 	i915_gem_object_unpin_map(h->hws);
 	i915_gem_object_put(h->hws);
 
-	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
+	flush_test(h->i915, I915_WAIT_LOCKED);
 }
 
 static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
@@ -487,7 +538,9 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 		if (err)
 			break;
 
-		cond_resched();
+		err = flush_test(i915, 0);
+		if (err)
+			break;
 	}
 
 	if (i915_terminally_wedged(&i915->gpu_error))
@@ -726,7 +779,9 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 		if (err)
 			break;
 
-		cond_resched();
+		err = flush_test(i915, 0);
+		if (err)
+			break;
 	}
 
 	if (i915_terminally_wedged(&i915->gpu_error))
@@ -952,6 +1007,10 @@ static int igt_reset_queue(void *arg)
 		i915_gem_chipset_flush(i915);
 
 		i915_gem_request_put(prev);
+
+		err = flush_test(i915, I915_WAIT_LOCKED);
+		if (err)
+			break;
 	}
 
 fini:
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
@ 2018-02-05  9:21 ` Chris Wilson
  2018-02-05 14:02   ` Mika Kuoppala
  2018-02-05  9:21 ` [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home Chris Wilson
                   ` (6 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:21 UTC (permalink / raw)
  To: intel-gfx; +Cc: Michel

Avoid injecting hangs in to the i915->kernel_context in case the GPU
reset leaves corruption in the context image in its wake (leading to
continual failures and system hangs after the selftests are ostensibly
complete). Use a sacrificial kernel_context instead.

v2: Closing a context is tricky; export a function (for selftests) from
i915_gem_context.c to get it right.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 39 +++++++++++++-----------
 drivers/gpu/drm/i915/selftests/mock_context.c    | 11 +++++++
 drivers/gpu/drm/i915/selftests/mock_context.h    |  3 ++
 3 files changed, 36 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index a4f4ff22389b..e0b662a2b8ae 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -33,6 +33,7 @@ struct hang {
 	struct drm_i915_private *i915;
 	struct drm_i915_gem_object *hws;
 	struct drm_i915_gem_object *obj;
+	struct i915_gem_context *ctx;
 	u32 *seqno;
 	u32 *batch;
 };
@@ -45,9 +46,15 @@ static int hang_init(struct hang *h, struct drm_i915_private *i915)
 	memset(h, 0, sizeof(*h));
 	h->i915 = i915;
 
+	h->ctx = kernel_context(i915);
+	if (IS_ERR(h->ctx))
+		return PTR_ERR(h->ctx);
+
 	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
-	if (IS_ERR(h->hws))
-		return PTR_ERR(h->hws);
+	if (IS_ERR(h->hws)) {
+		err = PTR_ERR(h->hws);
+		goto err_ctx;
+	}
 
 	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
 	if (IS_ERR(h->obj)) {
@@ -79,6 +86,8 @@ static int hang_init(struct hang *h, struct drm_i915_private *i915)
 	i915_gem_object_put(h->obj);
 err_hws:
 	i915_gem_object_put(h->hws);
+err_ctx:
+	kernel_context_close(h->ctx);
 	return err;
 }
 
@@ -196,9 +205,7 @@ static int emit_recurse_batch(struct hang *h,
 }
 
 static struct drm_i915_gem_request *
-hang_create_request(struct hang *h,
-		    struct intel_engine_cs *engine,
-		    struct i915_gem_context *ctx)
+hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *rq;
 	int err;
@@ -225,7 +232,7 @@ hang_create_request(struct hang *h,
 		h->batch = vaddr;
 	}
 
-	rq = i915_gem_request_alloc(engine, ctx);
+	rq = i915_gem_request_alloc(engine, h->ctx);
 	if (IS_ERR(rq))
 		return rq;
 
@@ -306,6 +313,8 @@ static void hang_fini(struct hang *h)
 	i915_gem_object_unpin_map(h->hws);
 	i915_gem_object_put(h->hws);
 
+	kernel_context_close(h->ctx);
+
 	flush_test(h->i915, I915_WAIT_LOCKED);
 }
 
@@ -341,7 +350,7 @@ static int igt_hang_sanitycheck(void *arg)
 		if (!intel_engine_can_store_dword(engine))
 			continue;
 
-		rq = hang_create_request(&h, engine, i915->kernel_context);
+		rq = hang_create_request(&h, engine);
 		if (IS_ERR(rq)) {
 			err = PTR_ERR(rq);
 			pr_err("Failed to create request for %s, err=%d\n",
@@ -478,8 +487,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				struct drm_i915_gem_request *rq;
 
 				mutex_lock(&i915->drm.struct_mutex);
-				rq = hang_create_request(&h, engine,
-							 i915->kernel_context);
+				rq = hang_create_request(&h, engine);
 				if (IS_ERR(rq)) {
 					err = PTR_ERR(rq);
 					mutex_unlock(&i915->drm.struct_mutex);
@@ -686,8 +694,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 				struct drm_i915_gem_request *rq;
 
 				mutex_lock(&i915->drm.struct_mutex);
-				rq = hang_create_request(&h, engine,
-							 i915->kernel_context);
+				rq = hang_create_request(&h, engine);
 				if (IS_ERR(rq)) {
 					err = PTR_ERR(rq);
 					mutex_unlock(&i915->drm.struct_mutex);
@@ -842,7 +849,7 @@ static int igt_wait_reset(void *arg)
 	if (err)
 		goto unlock;
 
-	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
+	rq = hang_create_request(&h, i915->engine[RCS]);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto fini;
@@ -921,7 +928,7 @@ static int igt_reset_queue(void *arg)
 		if (!intel_engine_can_store_dword(engine))
 			continue;
 
-		prev = hang_create_request(&h, engine, i915->kernel_context);
+		prev = hang_create_request(&h, engine);
 		if (IS_ERR(prev)) {
 			err = PTR_ERR(prev);
 			goto fini;
@@ -935,9 +942,7 @@ static int igt_reset_queue(void *arg)
 			struct drm_i915_gem_request *rq;
 			unsigned int reset_count;
 
-			rq = hang_create_request(&h,
-						 engine,
-						 i915->kernel_context);
+			rq = hang_create_request(&h, engine);
 			if (IS_ERR(rq)) {
 				err = PTR_ERR(rq);
 				goto fini;
@@ -1048,7 +1053,7 @@ static int igt_handle_error(void *arg)
 	if (err)
 		goto err_unlock;
 
-	rq = hang_create_request(&h, engine, i915->kernel_context);
+	rq = hang_create_request(&h, engine);
 	if (IS_ERR(rq)) {
 		err = PTR_ERR(rq);
 		goto err_fini;
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index bbf80d42e793..501becc47c0c 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -92,3 +92,14 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
 
 	return i915_gem_create_context(i915, file->driver_priv);
 }
+
+struct i915_gem_context *
+kernel_context(struct drm_i915_private *i915)
+{
+	return i915_gem_context_create_kernel(i915, I915_PRIORITY_NORMAL);
+}
+
+void kernel_context_close(struct i915_gem_context *ctx)
+{
+	context_close(ctx);
+}
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.h b/drivers/gpu/drm/i915/selftests/mock_context.h
index 2f432c03d413..29b9d60a158b 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.h
+++ b/drivers/gpu/drm/i915/selftests/mock_context.h
@@ -36,4 +36,7 @@ void mock_context_close(struct i915_gem_context *ctx);
 struct i915_gem_context *
 live_context(struct drm_i915_private *i915, struct drm_file *file);
 
+struct i915_gem_context *kernel_context(struct drm_i915_private *i915);
+void kernel_context_close(struct i915_gem_context *ctx);
+
 #endif /* !__MOCK_CONTEXT_H */
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
  2018-02-05  9:21 ` [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing Chris Wilson
@ 2018-02-05  9:21 ` Chris Wilson
  2018-02-05 15:19   ` Mika Kuoppala
  2018-02-05  9:21 ` [PATCH 4/7] drm/i915: Skip post-reset request emission if the engine is not idle Chris Wilson
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:21 UTC (permalink / raw)
  To: intel-gfx

In preparation for the next patch, we want the engine to appear idle
after a reset (if there are no requests in flight). For execlists, this
entails clearing the active status on reset, it will be regenerated on
restarting the engine after the reset. In the process, note that a
couple of other status flags and checks could be moved into the
describing function.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index deeedfc9fe44..0af9488e4070 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1462,6 +1462,9 @@ static void enable_execlists(struct intel_engine_cs *engine)
 	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
 		   engine->status_page.ggtt_offset);
 	POSTING_READ(RING_HWS_PGA(engine->mmio_base));
+
+	/* Following the reset, we need to reload the CSB read/write pointers */
+	engine->execlists.csb_head = -1;
 }
 
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
@@ -1479,11 +1482,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	enable_execlists(engine);
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 
-	GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
-
-	execlists->csb_head = -1;
-	execlists->active = 0;
-
 	/* After a GPU reset, we may have requests to replay */
 	if (execlists->first)
 		tasklet_schedule(&execlists->tasklet);
@@ -1529,6 +1527,8 @@ static void reset_irq(struct intel_engine_cs *engine)
 	struct drm_i915_private *dev_priv = engine->i915;
 	int i;
 
+	GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
+
 	/*
 	 * Clear any pending interrupt state.
 	 *
@@ -1577,6 +1577,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
+	/* Mark all CS interrupts as complete */
+	execlists->active = 0;
+
 	/* If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
 	 * have been corrupted by the reset, in which case we may have
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 4/7] drm/i915: Skip post-reset request emission if the engine is not idle
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
  2018-02-05  9:21 ` [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing Chris Wilson
  2018-02-05  9:21 ` [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home Chris Wilson
@ 2018-02-05  9:21 ` Chris Wilson
  2018-02-05  9:21 ` [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged Chris Wilson
                   ` (4 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:21 UTC (permalink / raw)
  To: intel-gfx

Since commit 7b6da818d86f ("drm/i915: Restore the kernel context after a
GPU reset on an idle engine") we submit a request following the engine
reset. The intent is that we don't submit a request if the engine is
busy (as it will restart active by itself) but we only checked to see if
there were remaining requests in flight on the hardware and skipped
checking to see if there were any ready requests that would be
immediately submitted on restart (the same time as our new request would
be). Having convinced the engine to appear idle in the previous patch,
we can use intel_engine_is_idle() as a better test to only submit a new
request if there are no pending requests.

As it happens, this is tripping up igt/drv_selftest/live_hangcheck in CI
as we overfill the kernel_context ringbuffer trigger an infinite
recursion from within the reset.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104786
References: 7b6da818d86f ("drm/i915: Restore the kernel context after a GPU reset on an idle engine")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 63308ec016a3..6090ef3141be 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3134,7 +3134,7 @@ void i915_gem_reset(struct drm_i915_private *dev_priv)
 		 * an incoherent read by the CS (presumably stale TLB). An
 		 * empty request appears sufficient to paper over the glitch.
 		 */
-		if (list_empty(&engine->timeline->requests)) {
+		if (intel_engine_is_idle(engine)) {
 			struct drm_i915_gem_request *rq;
 
 			rq = i915_gem_request_alloc(engine,
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
                   ` (2 preceding siblings ...)
  2018-02-05  9:21 ` [PATCH 4/7] drm/i915: Skip post-reset request emission if the engine is not idle Chris Wilson
@ 2018-02-05  9:21 ` Chris Wilson
  2018-02-05  9:51   ` Mika Kuoppala
  2018-02-05  9:22 ` [PATCH 6/7] drm/i915/execlists: Remove the startup spam Chris Wilson
                   ` (3 subsequent siblings)
  7 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:21 UTC (permalink / raw)
  To: intel-gfx

Dump each engine state when i915_gem_set_wedged() is called to give us
some more clues as to why we had to terminate the GPU.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6090ef3141be..a11358fd1176 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3199,6 +3199,13 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	if (drm_debug & DRM_UT_DRIVER) {
+		struct drm_printer p = drm_debug_printer(__func__);
+
+		for_each_engine(engine, i915, id)
+			intel_engine_dump(engine, &p, "%s\n", engine->name);
+	}
+
 	/*
 	 * First, stop submission to hw, but do not yet complete requests by
 	 * rolling the global seqno forward (since this would complete requests
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 6/7] drm/i915/execlists: Remove the startup spam
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
                   ` (3 preceding siblings ...)
  2018-02-05  9:21 ` [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged Chris Wilson
@ 2018-02-05  9:22 ` Chris Wilson
  2018-02-05  9:22 ` [PATCH 7/7] drm/i915: Remove unbannable context spam from reset Chris Wilson
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:22 UTC (permalink / raw)
  To: intel-gfx

Execlists is now enabled by default and included in the list of
capabilities printed out to dmesg and beyond. We do not need to mention
it again, every time we restart the engine, so kill the spam.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0af9488e4070..adf257dfa5e0 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1480,7 +1480,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	intel_engine_init_hangcheck(engine);
 
 	enable_execlists(engine);
-	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 
 	/* After a GPU reset, we may have requests to replay */
 	if (execlists->first)
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 7/7] drm/i915: Remove unbannable context spam from reset
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
                   ` (4 preceding siblings ...)
  2018-02-05  9:22 ` [PATCH 6/7] drm/i915/execlists: Remove the startup spam Chris Wilson
@ 2018-02-05  9:22 ` Chris Wilson
  2018-02-05  9:30   ` Chris Wilson
  2018-02-05  9:58 ` ✗ Fi.CI.BAT: warning for series starting with [1/7] drm/i915/selftests: Flush old resets between engines Patchwork
  2018-02-05 14:27 ` [PATCH 1/7] " Mika Kuoppala
  7 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:22 UTC (permalink / raw)
  To: intel-gfx

During testing, we trigger a lot of resets on an unbannable context
leading to massive amounts of irrelevant debug spam. Remove the
ban_score accounting and message for the unbannable context so that we
improve the signal:noise in the log messages for when the unexpected
occurs.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a11358fd1176..4dd084f34f1e 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2823,24 +2823,23 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static bool ban_context(const struct i915_gem_context *ctx,
-			unsigned int score)
-{
-	return (i915_gem_context_is_bannable(ctx) &&
-		score >= CONTEXT_SCORE_BAN_THRESHOLD);
-}
-
 static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
 {
-	unsigned int score;
 	bool banned;
 
 	atomic_inc(&ctx->guilty_count);
 
-	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = ban_context(ctx, score);
-	DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
-			 ctx->name, score, yesno(banned));
+	banned = false;
+	if (i915_gem_context_is_bannable(ctx)) {
+		unsigned int score;
+
+		score = atomic_add_return(CONTEXT_SCORE_GUILTY,
+					  &ctx->ban_score);
+		banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+
+		DRM_DEBUG_DRIVER("context %s marked guilty (score %d) banned? %s\n",
+				 ctx->name, score, yesno(banned));
+	}
 	if (!banned)
 		return;
 
-- 
2.15.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 7/7] drm/i915: Remove unbannable context spam from reset
  2018-02-05  9:22 ` [PATCH 7/7] drm/i915: Remove unbannable context spam from reset Chris Wilson
@ 2018-02-05  9:30   ` Chris Wilson
  2018-02-05 13:27     ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2018-02-05  9:30 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-02-05 09:22:01)
> During testing, we trigger a lot of resets on an unbannable context
> leading to massive amounts of irrelevant debug spam. Remove the
> ban_score accounting and message for the unbannable context so that we
> improve the signal:noise in the log messages for when the unexpected
> occurs.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

From older thread,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged
  2018-02-05  9:21 ` [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged Chris Wilson
@ 2018-02-05  9:51   ` Mika Kuoppala
  2018-02-05 10:02     ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Mika Kuoppala @ 2018-02-05  9:51 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Dump each engine state when i915_gem_set_wedged() is called to give us
> some more clues as to why we had to terminate the GPU.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c | 7 +++++++
>  1 file changed, 7 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 6090ef3141be..a11358fd1176 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3199,6 +3199,13 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> +	if (drm_debug & DRM_UT_DRIVER) {
> +		struct drm_printer p = drm_debug_printer(__func__);
> +
> +		for_each_engine(engine, i915, id)
> +			intel_engine_dump(engine, &p, "%s\n", engine->name);

We have both %s\n and plain %s across the dumps we do across the driver.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +	}
> +
>  	/*
>  	 * First, stop submission to hw, but do not yet complete requests by
>  	 * rolling the global seqno forward (since this would complete requests
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* ✗ Fi.CI.BAT: warning for series starting with [1/7] drm/i915/selftests: Flush old resets between engines
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
                   ` (5 preceding siblings ...)
  2018-02-05  9:22 ` [PATCH 7/7] drm/i915: Remove unbannable context spam from reset Chris Wilson
@ 2018-02-05  9:58 ` Patchwork
  2018-02-05 14:27 ` [PATCH 1/7] " Mika Kuoppala
  7 siblings, 0 replies; 16+ messages in thread
From: Patchwork @ 2018-02-05  9:58 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915/selftests: Flush old resets between engines
URL   : https://patchwork.freedesktop.org/series/37645/
State : warning

== Summary ==

Series 37645v1 series starting with [1/7] drm/i915/selftests: Flush old resets between engines
https://patchwork.freedesktop.org/api/1.0/series/37645/revisions/1/mbox/

Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                pass       -> FAIL       (fi-gdg-551) fdo#102575
Test gem_sync:
        Subgroup basic-all:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-each:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-many-each:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-store-all:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-store-each:
                pass       -> SKIP       (fi-blb-e6850)
Test gem_tiled_blits:
        Subgroup basic:
                pass       -> SKIP       (fi-blb-e6850)
Test gem_tiled_fence_blits:
        Subgroup basic:
                pass       -> SKIP       (fi-blb-e6850)
Test gem_wait:
        Subgroup basic-busy-all:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-wait-all:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-await-all:
                pass       -> SKIP       (fi-blb-e6850)
Test kms_busy:
        Subgroup basic-flip-a:
                pass       -> SKIP       (fi-blb-e6850)
        Subgroup basic-flip-b:
                pass       -> SKIP       (fi-blb-e6850)
Test kms_cursor_legacy:
        Subgroup basic-busy-flip-before-cursor-legacy:
                pass       -> SKIP       (fi-blb-e6850)

fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575

fi-bdw-5557u     total:288  pass:267  dwarn:0   dfail:0   fail:0   skip:21  time:418s
fi-bdw-gvtdvm    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:425s
fi-blb-e6850     total:288  pass:210  dwarn:1   dfail:0   fail:0   skip:77  time:344s
fi-bsw-n3050     total:288  pass:242  dwarn:0   dfail:0   fail:0   skip:46  time:483s
fi-bwr-2160      total:288  pass:183  dwarn:0   dfail:0   fail:0   skip:105 time:283s
fi-bxt-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:479s
fi-bxt-j4205     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:484s
fi-byt-j1900     total:288  pass:253  dwarn:0   dfail:0   fail:0   skip:35  time:472s
fi-byt-n2820     total:288  pass:249  dwarn:0   dfail:0   fail:0   skip:39  time:452s
fi-cfl-s2        total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:567s
fi-elk-e7500     total:288  pass:229  dwarn:0   dfail:0   fail:0   skip:59  time:419s
fi-gdg-551       total:288  pass:179  dwarn:0   dfail:0   fail:1   skip:108 time:277s
fi-glk-1         total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:508s
fi-hsw-4770      total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:387s
fi-hsw-4770r     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:399s
fi-ilk-650       total:288  pass:228  dwarn:0   dfail:0   fail:0   skip:60  time:407s
fi-ivb-3520m     total:288  pass:259  dwarn:0   dfail:0   fail:0   skip:29  time:451s
fi-ivb-3770      total:288  pass:255  dwarn:0   dfail:0   fail:0   skip:33  time:419s
fi-kbl-7500u     total:288  pass:263  dwarn:1   dfail:0   fail:0   skip:24  time:456s
fi-kbl-7560u     total:288  pass:269  dwarn:0   dfail:0   fail:0   skip:19  time:498s
fi-kbl-7567u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:451s
fi-kbl-r         total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:500s
fi-pnv-d510      total:288  pass:222  dwarn:1   dfail:0   fail:0   skip:65  time:582s
fi-skl-6260u     total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:432s
fi-skl-6600u     total:288  pass:261  dwarn:0   dfail:0   fail:0   skip:27  time:511s
fi-skl-6700hq    total:288  pass:262  dwarn:0   dfail:0   fail:0   skip:26  time:530s
fi-skl-6700k2    total:288  pass:264  dwarn:0   dfail:0   fail:0   skip:24  time:480s
fi-skl-6770hq    total:288  pass:268  dwarn:0   dfail:0   fail:0   skip:20  time:491s
fi-skl-guc       total:288  pass:260  dwarn:0   dfail:0   fail:0   skip:28  time:418s
fi-skl-gvtdvm    total:288  pass:265  dwarn:0   dfail:0   fail:0   skip:23  time:430s
fi-snb-2520m     total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:525s
fi-snb-2600      total:288  pass:248  dwarn:0   dfail:0   fail:0   skip:40  time:393s
Blacklisted hosts:
fi-glk-dsi       total:288  pass:258  dwarn:0   dfail:0   fail:0   skip:30  time:469s

2e76a2952923eba64c4f9baf461613bc42ee997a drm-tip: 2018y-02m-02d-20h-33m-12s UTC integration manifest
e63de0d27388 drm/i915: Remove unbannable context spam from reset
e029dfa3c180 drm/i915/execlists: Remove the startup spam
0f15882e49c8 drm/i915: Show the GPU state when declaring wedged
84574a7a1c7b drm/i915: Skip post-reset request emission if the engine is not idle
06a631a2449d drm/i915/execlists: Move the reset bits to a more natural home
4b5083f7ce6d drm/i915/selftests: Use a sacrificial context for hang testing
4d3fd2f0c674 drm/i915/selftests: Flush old resets between engines

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_7882/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged
  2018-02-05  9:51   ` Mika Kuoppala
@ 2018-02-05 10:02     ` Chris Wilson
  0 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05 10:02 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-02-05 09:51:42)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Dump each engine state when i915_gem_set_wedged() is called to give us
> > some more clues as to why we had to terminate the GPU.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 6090ef3141be..a11358fd1176 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3199,6 +3199,13 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
> >       struct intel_engine_cs *engine;
> >       enum intel_engine_id id;
> >  
> > +     if (drm_debug & DRM_UT_DRIVER) {
> > +             struct drm_printer p = drm_debug_printer(__func__);
> > +
> > +             for_each_engine(engine, i915, id)
> > +                     intel_engine_dump(engine, &p, "%s\n", engine->name);
> 
> We have both %s\n and plain %s across the dumps we do across the driver.

Hmm, if (header) drm_vprintf(m, header, &ap); We need those "\n".

Though the whole indentation scheme for intel_engine_dump() needs some
thought and fixing.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 7/7] drm/i915: Remove unbannable context spam from reset
  2018-02-05  9:30   ` Chris Wilson
@ 2018-02-05 13:27     ` Chris Wilson
  0 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05 13:27 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-02-05 09:30:27)
> Quoting Chris Wilson (2018-02-05 09:22:01)
> > During testing, we trigger a lot of resets on an unbannable context
> > leading to massive amounts of irrelevant debug spam. Remove the
> > ban_score accounting and message for the unbannable context so that we
> > improve the signal:noise in the log messages for when the unexpected
> > occurs.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> 
> From older thread,
> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Applied the 3 logging reduction patches along with the 2 improvements
suggested by Mika. Still need to get the selftest and reset fixes
reviewed, please :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing
  2018-02-05  9:21 ` [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing Chris Wilson
@ 2018-02-05 14:02   ` Mika Kuoppala
  2018-02-05 14:06     ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Mika Kuoppala @ 2018-02-05 14:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Michel

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Avoid injecting hangs in to the i915->kernel_context in case the GPU
> reset leaves corruption in the context image in its wake (leading to
> continual failures and system hangs after the selftests are ostensibly
> complete). Use a sacrificial kernel_context instead.
>
> v2: Closing a context is tricky; export a function (for selftests) from
> i915_gem_context.c to get it right.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com
> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 39 +++++++++++++-----------
>  drivers/gpu/drm/i915/selftests/mock_context.c    | 11 +++++++
>  drivers/gpu/drm/i915/selftests/mock_context.h    |  3 ++
>  3 files changed, 36 insertions(+), 17 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index a4f4ff22389b..e0b662a2b8ae 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -33,6 +33,7 @@ struct hang {
>  	struct drm_i915_private *i915;
>  	struct drm_i915_gem_object *hws;
>  	struct drm_i915_gem_object *obj;
> +	struct i915_gem_context *ctx;
>  	u32 *seqno;
>  	u32 *batch;
>  };
> @@ -45,9 +46,15 @@ static int hang_init(struct hang *h, struct drm_i915_private *i915)
>  	memset(h, 0, sizeof(*h));
>  	h->i915 = i915;
>  
> +	h->ctx = kernel_context(i915);
> +	if (IS_ERR(h->ctx))
> +		return PTR_ERR(h->ctx);
> +
>  	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
> -	if (IS_ERR(h->hws))
> -		return PTR_ERR(h->hws);
> +	if (IS_ERR(h->hws)) {
> +		err = PTR_ERR(h->hws);
> +		goto err_ctx;
> +	}
>  
>  	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
>  	if (IS_ERR(h->obj)) {
> @@ -79,6 +86,8 @@ static int hang_init(struct hang *h, struct drm_i915_private *i915)
>  	i915_gem_object_put(h->obj);
>  err_hws:
>  	i915_gem_object_put(h->hws);
> +err_ctx:
> +	kernel_context_close(h->ctx);
>  	return err;
>  }
>  
> @@ -196,9 +205,7 @@ static int emit_recurse_batch(struct hang *h,
>  }
>  
>  static struct drm_i915_gem_request *
> -hang_create_request(struct hang *h,
> -		    struct intel_engine_cs *engine,
> -		    struct i915_gem_context *ctx)
> +hang_create_request(struct hang *h, struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *rq;
>  	int err;
> @@ -225,7 +232,7 @@ hang_create_request(struct hang *h,
>  		h->batch = vaddr;
>  	}
>  
> -	rq = i915_gem_request_alloc(engine, ctx);
> +	rq = i915_gem_request_alloc(engine, h->ctx);
>  	if (IS_ERR(rq))
>  		return rq;
>  
> @@ -306,6 +313,8 @@ static void hang_fini(struct hang *h)
>  	i915_gem_object_unpin_map(h->hws);
>  	i915_gem_object_put(h->hws);
>  
> +	kernel_context_close(h->ctx);
> +
>  	flush_test(h->i915, I915_WAIT_LOCKED);
>  }
>  
> @@ -341,7 +350,7 @@ static int igt_hang_sanitycheck(void *arg)
>  		if (!intel_engine_can_store_dword(engine))
>  			continue;
>  
> -		rq = hang_create_request(&h, engine, i915->kernel_context);
> +		rq = hang_create_request(&h, engine);
>  		if (IS_ERR(rq)) {
>  			err = PTR_ERR(rq);
>  			pr_err("Failed to create request for %s, err=%d\n",
> @@ -478,8 +487,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  				struct drm_i915_gem_request *rq;
>  
>  				mutex_lock(&i915->drm.struct_mutex);
> -				rq = hang_create_request(&h, engine,
> -							 i915->kernel_context);
> +				rq = hang_create_request(&h, engine);
>  				if (IS_ERR(rq)) {
>  					err = PTR_ERR(rq);
>  					mutex_unlock(&i915->drm.struct_mutex);
> @@ -686,8 +694,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
>  				struct drm_i915_gem_request *rq;
>  
>  				mutex_lock(&i915->drm.struct_mutex);
> -				rq = hang_create_request(&h, engine,
> -							 i915->kernel_context);
> +				rq = hang_create_request(&h, engine);
>  				if (IS_ERR(rq)) {
>  					err = PTR_ERR(rq);
>  					mutex_unlock(&i915->drm.struct_mutex);
> @@ -842,7 +849,7 @@ static int igt_wait_reset(void *arg)
>  	if (err)
>  		goto unlock;
>  
> -	rq = hang_create_request(&h, i915->engine[RCS], i915->kernel_context);
> +	rq = hang_create_request(&h, i915->engine[RCS]);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
>  		goto fini;
> @@ -921,7 +928,7 @@ static int igt_reset_queue(void *arg)
>  		if (!intel_engine_can_store_dword(engine))
>  			continue;
>  
> -		prev = hang_create_request(&h, engine, i915->kernel_context);
> +		prev = hang_create_request(&h, engine);
>  		if (IS_ERR(prev)) {
>  			err = PTR_ERR(prev);
>  			goto fini;
> @@ -935,9 +942,7 @@ static int igt_reset_queue(void *arg)
>  			struct drm_i915_gem_request *rq;
>  			unsigned int reset_count;
>  
> -			rq = hang_create_request(&h,
> -						 engine,
> -						 i915->kernel_context);
> +			rq = hang_create_request(&h, engine);
>  			if (IS_ERR(rq)) {
>  				err = PTR_ERR(rq);
>  				goto fini;
> @@ -1048,7 +1053,7 @@ static int igt_handle_error(void *arg)
>  	if (err)
>  		goto err_unlock;
>  
> -	rq = hang_create_request(&h, engine, i915->kernel_context);
> +	rq = hang_create_request(&h, engine);
>  	if (IS_ERR(rq)) {
>  		err = PTR_ERR(rq);
>  		goto err_fini;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index bbf80d42e793..501becc47c0c 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -92,3 +92,14 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
>  
>  	return i915_gem_create_context(i915, file->driver_priv);
>  }
> +
> +struct i915_gem_context *
> +kernel_context(struct drm_i915_private *i915)

kernel_context_create would be more symmetric.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +{
> +	return i915_gem_context_create_kernel(i915, I915_PRIORITY_NORMAL);
> +}
> +
> +void kernel_context_close(struct i915_gem_context *ctx)
> +{
> +	context_close(ctx);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.h b/drivers/gpu/drm/i915/selftests/mock_context.h
> index 2f432c03d413..29b9d60a158b 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.h
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.h
> @@ -36,4 +36,7 @@ void mock_context_close(struct i915_gem_context *ctx);
>  struct i915_gem_context *
>  live_context(struct drm_i915_private *i915, struct drm_file *file);
>  
> +struct i915_gem_context *kernel_context(struct drm_i915_private *i915);
> +void kernel_context_close(struct i915_gem_context *ctx);
> +
>  #endif /* !__MOCK_CONTEXT_H */
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing
  2018-02-05 14:02   ` Mika Kuoppala
@ 2018-02-05 14:06     ` Chris Wilson
  0 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2018-02-05 14:06 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx; +Cc: Michel

Quoting Mika Kuoppala (2018-02-05 14:02:55)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> > index bbf80d42e793..501becc47c0c 100644
> > --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> > +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> > @@ -92,3 +92,14 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
> >  
> >       return i915_gem_create_context(i915, file->driver_priv);
> >  }
> > +
> > +struct i915_gem_context *
> > +kernel_context(struct drm_i915_private *i915)
> 
> kernel_context_create would be more symmetric.

Already established mock_context(), live_context() hence
kernel_context(). Which will comes first C++ in the kernel or the will
to rename? ;)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 1/7] drm/i915/selftests: Flush old resets between engines
  2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
                   ` (6 preceding siblings ...)
  2018-02-05  9:58 ` ✗ Fi.CI.BAT: warning for series starting with [1/7] drm/i915/selftests: Flush old resets between engines Patchwork
@ 2018-02-05 14:27 ` Mika Kuoppala
  7 siblings, 0 replies; 16+ messages in thread
From: Mika Kuoppala @ 2018-02-05 14:27 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> When injecting rapid resets, we must be careful to at least wait for the
> previous reset to have taken effect and the engine restarted. If we
> perform a second reset before that has happened, we will notice that the
> engine hasn't recovered and declare it lost, wedging the device and
> failing. In practice, since we wait for each hanging batch to start
> before injecting the reset, this too-fast-reset condition can only be
> triggered when moving onto the next engine in the test, so we need only
> wait for the existing reset to complete before switching engines.
>
> v2: Wrap up the wait inside a safety net to bail out in case of angry hw.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 65 ++++++++++++++++++++++--
>  1 file changed, 62 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index d1f91a533afa..a4f4ff22389b 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -244,6 +244,57 @@ static u32 hws_seqno(const struct hang *h,
>  	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
>  }
>  
> +struct wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const void *symbol;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	pr_err("%pS timed out, cancelling all further testing.\n",
> +	       w->symbol);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __init_wedge(struct wedge_me *w,
> +			 struct drm_i915_private *i915,
> +			 long timeout,
> +			 const void *symbol)
> +{
> +	w->i915 = i915;
> +	w->symbol = symbol;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __fini_wedge(struct wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> +
> +#define wedge_on_timeout(W, DEV, TIMEOUT)				\
> +	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
> +	     (W)->i915;							\
> +	     __fini_wedge((W)))
> +
> +static int flush_test(struct drm_i915_private *i915, unsigned int flags)
> +{
> +	struct wedge_me w;
> +
> +	cond_resched();
> +
> +	wedge_on_timeout(&w, i915, HZ)
> +		i915_gem_wait_for_idle(i915, flags);
> +
> +	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> +}
> +
>  static void hang_fini(struct hang *h)
>  {
>  	*h->batch = MI_BATCH_BUFFER_END;
> @@ -255,7 +306,7 @@ static void hang_fini(struct hang *h)
>  	i915_gem_object_unpin_map(h->hws);
>  	i915_gem_object_put(h->hws);
>  
> -	i915_gem_wait_for_idle(h->i915, I915_WAIT_LOCKED);
> +	flush_test(h->i915, I915_WAIT_LOCKED);
>  }
>  
>  static bool wait_for_hang(struct hang *h, struct drm_i915_gem_request *rq)
> @@ -487,7 +538,9 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
>  		if (err)
>  			break;
>  
> -		cond_resched();
> +		err = flush_test(i915, 0);
> +		if (err)
> +			break;
>  	}
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
> @@ -726,7 +779,9 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
>  		if (err)
>  			break;
>  
> -		cond_resched();
> +		err = flush_test(i915, 0);
> +		if (err)
> +			break;
>  	}
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
> @@ -952,6 +1007,10 @@ static int igt_reset_queue(void *arg)
>  		i915_gem_chipset_flush(i915);
>  
>  		i915_gem_request_put(prev);
> +
> +		err = flush_test(i915, I915_WAIT_LOCKED);
> +		if (err)
> +			break;
>  	}
>  
>  fini:
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home
  2018-02-05  9:21 ` [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home Chris Wilson
@ 2018-02-05 15:19   ` Mika Kuoppala
  0 siblings, 0 replies; 16+ messages in thread
From: Mika Kuoppala @ 2018-02-05 15:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> In preparation for the next patch, we want the engine to appear idle
> after a reset (if there are no requests in flight). For execlists, this
> entails clearing the active status on reset, it will be regenerated on
> restarting the engine after the reset. In the process, note that a
> couple of other status flags and checks could be moved into the
> describing function.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 13 ++++++++-----
>  1 file changed, 8 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index deeedfc9fe44..0af9488e4070 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1462,6 +1462,9 @@ static void enable_execlists(struct intel_engine_cs *engine)
>  	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
>  		   engine->status_page.ggtt_offset);
>  	POSTING_READ(RING_HWS_PGA(engine->mmio_base));
> +
> +	/* Following the reset, we need to reload the CSB read/write pointers */
> +	engine->execlists.csb_head = -1;
>  }
>  
>  static int gen8_init_common_ring(struct intel_engine_cs *engine)
> @@ -1479,11 +1482,6 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>  	enable_execlists(engine);
>  	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
>  
> -	GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
> -
> -	execlists->csb_head = -1;
> -	execlists->active = 0;
> -
>  	/* After a GPU reset, we may have requests to replay */
>  	if (execlists->first)
>  		tasklet_schedule(&execlists->tasklet);
> @@ -1529,6 +1527,8 @@ static void reset_irq(struct intel_engine_cs *engine)
>  	struct drm_i915_private *dev_priv = engine->i915;
>  	int i;
>  
> +	GEM_BUG_ON(engine->id >= ARRAY_SIZE(gtiir));
> +
>  	/*
>  	 * Clear any pending interrupt state.
>  	 *
> @@ -1577,6 +1577,9 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  
> +	/* Mark all CS interrupts as complete */
> +	execlists->active = 0;
> +
>  	/* If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
>  	 * have been corrupted by the reset, in which case we may have
> -- 
> 2.15.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2018-02-05 15:20 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-02-05  9:21 [PATCH 1/7] drm/i915/selftests: Flush old resets between engines Chris Wilson
2018-02-05  9:21 ` [PATCH 2/7] drm/i915/selftests: Use a sacrificial context for hang testing Chris Wilson
2018-02-05 14:02   ` Mika Kuoppala
2018-02-05 14:06     ` Chris Wilson
2018-02-05  9:21 ` [PATCH 3/7] drm/i915/execlists: Move the reset bits to a more natural home Chris Wilson
2018-02-05 15:19   ` Mika Kuoppala
2018-02-05  9:21 ` [PATCH 4/7] drm/i915: Skip post-reset request emission if the engine is not idle Chris Wilson
2018-02-05  9:21 ` [PATCH 5/7] drm/i915: Show the GPU state when declaring wedged Chris Wilson
2018-02-05  9:51   ` Mika Kuoppala
2018-02-05 10:02     ` Chris Wilson
2018-02-05  9:22 ` [PATCH 6/7] drm/i915/execlists: Remove the startup spam Chris Wilson
2018-02-05  9:22 ` [PATCH 7/7] drm/i915: Remove unbannable context spam from reset Chris Wilson
2018-02-05  9:30   ` Chris Wilson
2018-02-05 13:27     ` Chris Wilson
2018-02-05  9:58 ` ✗ Fi.CI.BAT: warning for series starting with [1/7] drm/i915/selftests: Flush old resets between engines Patchwork
2018-02-05 14:27 ` [PATCH 1/7] " Mika Kuoppala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.