[PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide

All of lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide
@ 2018-03-22  7:35 Chris Wilson
  2018-03-22  7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
                   ` (8 more replies)
  0 siblings, 9 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22  7:35 UTC (permalink / raw)
  To: intel-gfx

If we fail to reset the GPU in a timely fashion, dump the GEM trace so
that we can see what operations were in flight when the GPU got stuck.

v2: There's more than one timeout that deserves tracing!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4372826998aa..1969a65072ca 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
 {
 	struct wedge_me *w = container_of(work, typeof(*w), work.work);
 
-	pr_err("%pS timed out, cancelling all further testing.\n",
-	       w->symbol);
+	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
+
+	GEM_TRACE("%pS timed out.\n", w->symbol);
+	GEM_TRACE_DUMP();
+
 	i915_gem_set_wedged(w->i915);
 }
 
@@ -621,9 +624,19 @@ static int active_engine(void *data)
 		mutex_unlock(&engine->i915->drm.struct_mutex);
 
 		if (old) {
-			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
+			if (i915_request_wait(old, 0, 10*HZ) < 0) {
+				GEM_TRACE("%s timed out.\n", engine->name);
+				GEM_TRACE_DUMP();
+
+				i915_gem_set_wedged(engine->i915);
+				i915_request_put(old);
+				err = -EIO;
+				break;
+			}
 			i915_request_put(old);
 		}
+
+		cond_resched();
 	}
 
 	for (count = 0; count < ARRAY_SIZE(rq); count++)
@@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 
 	err = i915_subtests(tests, i915);
 
+	mutex_lock(&i915->drm.struct_mutex);
+	flush_test(i915, I915_WAIT_LOCKED);
+	mutex_unlock(&i915->drm.struct_mutex);
+
 	i915_modparams.enable_hangcheck = saved_hangcheck;
 	intel_runtime_pm_put(i915);
 
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
@ 2018-03-22  7:35 ` Chris Wilson
  2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22  7:35 UTC (permalink / raw)
  To: intel-gfx

Watch what happens if we try to reset with a queue of requests with
varying priorities -- that may need reordering or preemption across the
reset.

v2: Tweak priorities to avoid starving the hanging thread.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 189 +++++++++++++++--------
 1 file changed, 126 insertions(+), 63 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 1969a65072ca..c5ed4006c319 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -25,6 +25,7 @@
 #include <linux/kthread.h>
 
 #include "../i915_selftest.h"
+#include "i915_random.h"
 
 #include "mock_context.h"
 #include "mock_drm.h"
@@ -486,6 +487,8 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
+			u32 seqno = intel_engine_get_seqno(engine);
+
 			if (active) {
 				struct i915_request *rq;
 
@@ -514,12 +517,13 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 					break;
 				}
 
+				GEM_BUG_ON(!rq->global_seqno);
+				seqno = rq->global_seqno - 1;
 				i915_request_put(rq);
 			}
 
 			engine->hangcheck.stalled = true;
-			engine->hangcheck.seqno =
-				intel_engine_get_seqno(engine);
+			engine->hangcheck.seqno = seqno;
 
 			err = i915_reset_engine(engine, NULL);
 			if (err) {
@@ -576,11 +580,25 @@ static int igt_reset_active_engine(void *arg)
 	return __igt_reset_engine(arg, true);
 }
 
+struct active_engine {
+	struct task_struct *task;
+	struct intel_engine_cs *engine;
+	unsigned long resets;
+	unsigned int flags;
+};
+
+#define TEST_ACTIVE	BIT(0)
+#define TEST_OTHERS	BIT(1)
+#define TEST_SELF	BIT(2)
+#define TEST_PRIORITY	BIT(3)
+
 static int active_engine(void *data)
 {
-	struct intel_engine_cs *engine = data;
-	struct i915_request *rq[2] = {};
-	struct i915_gem_context *ctx[2];
+	I915_RND_STATE(prng);
+	struct active_engine *arg = data;
+	struct intel_engine_cs *engine = arg->engine;
+	struct i915_request *rq[8] = {};
+	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
 	struct drm_file *file;
 	unsigned long count = 0;
 	int err = 0;
@@ -589,25 +607,20 @@ static int active_engine(void *data)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	mutex_lock(&engine->i915->drm.struct_mutex);
-	ctx[0] = live_context(engine->i915, file);
-	mutex_unlock(&engine->i915->drm.struct_mutex);
-	if (IS_ERR(ctx[0])) {
-		err = PTR_ERR(ctx[0]);
-		goto err_file;
-	}
-
-	mutex_lock(&engine->i915->drm.struct_mutex);
-	ctx[1] = live_context(engine->i915, file);
-	mutex_unlock(&engine->i915->drm.struct_mutex);
-	if (IS_ERR(ctx[1])) {
-		err = PTR_ERR(ctx[1]);
-		i915_gem_context_put(ctx[0]);
-		goto err_file;
+	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
+		mutex_lock(&engine->i915->drm.struct_mutex);
+		ctx[count] = live_context(engine->i915, file);
+		mutex_unlock(&engine->i915->drm.struct_mutex);
+		if (IS_ERR(ctx[count])) {
+			err = PTR_ERR(ctx[count]);
+			while (--count)
+				i915_gem_context_put(ctx[count]);
+			goto err_file;
+		}
 	}
 
 	while (!kthread_should_stop()) {
-		unsigned int idx = count++ & 1;
+		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
 		struct i915_request *old = rq[idx];
 		struct i915_request *new;
 
@@ -619,6 +632,10 @@ static int active_engine(void *data)
 			break;
 		}
 
+		if (arg->flags & TEST_PRIORITY)
+			ctx[idx]->priority =
+				i915_prandom_u32_max_state(512, &prng);
+
 		rq[idx] = i915_request_get(new);
 		i915_request_add(new);
 		mutex_unlock(&engine->i915->drm.struct_mutex);
@@ -647,8 +664,9 @@ static int active_engine(void *data)
 	return err;
 }
 
-static int __igt_reset_engine_others(struct drm_i915_private *i915,
-				     bool active)
+static int __igt_reset_engines(struct drm_i915_private *i915,
+			       const char *test_name,
+			       unsigned int flags)
 {
 	struct intel_engine_cs *engine, *other;
 	enum intel_engine_id id, tmp;
@@ -662,50 +680,61 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 	if (!intel_has_reset_engine(i915))
 		return 0;
 
-	if (active) {
+	if (flags & TEST_ACTIVE) {
 		mutex_lock(&i915->drm.struct_mutex);
 		err = hang_init(&h, i915);
 		mutex_unlock(&i915->drm.struct_mutex);
 		if (err)
 			return err;
+
+		if (flags & TEST_PRIORITY)
+			h.ctx->priority = 1024;
 	}
 
 	for_each_engine(engine, i915, id) {
-		struct task_struct *threads[I915_NUM_ENGINES] = {};
-		unsigned long resets[I915_NUM_ENGINES];
+		struct active_engine threads[I915_NUM_ENGINES] = {};
 		unsigned long global = i915_reset_count(&i915->gpu_error);
-		unsigned long count = 0;
+		unsigned long count = 0, reported;
 		IGT_TIMEOUT(end_time);
 
-		if (active && !intel_engine_can_store_dword(engine))
+		if (flags & TEST_ACTIVE &&
+		    !intel_engine_can_store_dword(engine))
 			continue;
 
 		memset(threads, 0, sizeof(threads));
 		for_each_engine(other, i915, tmp) {
 			struct task_struct *tsk;
 
-			resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
-							      other);
+			threads[tmp].resets =
+				i915_reset_engine_count(&i915->gpu_error,
+							other);
 
-			if (other == engine)
+			if (!(flags & TEST_OTHERS))
 				continue;
 
-			tsk = kthread_run(active_engine, other,
+			if (other == engine && !(flags & TEST_SELF))
+				continue;
+
+			threads[tmp].engine = other;
+			threads[tmp].flags = flags;
+
+			tsk = kthread_run(active_engine, &threads[tmp],
 					  "igt/%s", other->name);
 			if (IS_ERR(tsk)) {
 				err = PTR_ERR(tsk);
 				goto unwind;
 			}
 
-			threads[tmp] = tsk;
+			threads[tmp].task = tsk;
 			get_task_struct(tsk);
 		}
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
-			if (active) {
-				struct i915_request *rq;
+			u32 seqno = intel_engine_get_seqno(engine);
+			struct i915_request *rq = NULL;
 
+			if (flags & TEST_ACTIVE) {
 				mutex_lock(&i915->drm.struct_mutex);
 				rq = hang_create_request(&h, engine);
 				if (IS_ERR(rq)) {
@@ -731,33 +760,38 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 					break;
 				}
 
-				i915_request_put(rq);
+				GEM_BUG_ON(!rq->global_seqno);
+				seqno = rq->global_seqno - 1;
 			}
 
 			engine->hangcheck.stalled = true;
-			engine->hangcheck.seqno =
-				intel_engine_get_seqno(engine);
+			engine->hangcheck.seqno = seqno;
 
 			err = i915_reset_engine(engine, NULL);
 			if (err) {
-				pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
-				       engine->name, active ? "active" : "idle", err);
+				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
+				       engine->name, test_name, err);
 				break;
 			}
 
 			engine->hangcheck.stalled = false;
 			count++;
+
+			if (rq) {
+				i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
+				i915_request_put(rq);
+			}
 		} while (time_before(jiffies, end_time));
 		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
-			engine->name, active ? "active" : "idle", count);
-
-		if (i915_reset_engine_count(&i915->gpu_error, engine) -
-		    resets[engine->id] != (active ? count : 0)) {
-			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
-			       engine->name, active ? "active" : "idle", count,
-			       i915_reset_engine_count(&i915->gpu_error,
-						       engine) - resets[engine->id]);
+			engine->name, test_name, count);
+
+		reported = i915_reset_engine_count(&i915->gpu_error, engine);
+		reported -= threads[engine->id].resets;
+		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
+			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
+			       engine->name, test_name, count, reported,
+			       (flags & TEST_ACTIVE ? count : 0));
 			if (!err)
 				err = -EINVAL;
 		}
@@ -766,24 +800,26 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 		for_each_engine(other, i915, tmp) {
 			int ret;
 
-			if (!threads[tmp])
+			if (!threads[tmp].task)
 				continue;
 
-			ret = kthread_stop(threads[tmp]);
+			ret = kthread_stop(threads[tmp].task);
 			if (ret) {
 				pr_err("kthread for other engine %s failed, err=%d\n",
 				       other->name, ret);
 				if (!err)
 					err = ret;
 			}
-			put_task_struct(threads[tmp]);
+			put_task_struct(threads[tmp].task);
 
-			if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
-								   other)) {
+			if (other != engine &&
+			    threads[tmp].resets !=
+			    i915_reset_engine_count(&i915->gpu_error, other)) {
 				pr_err("Innocent engine %s was reset (count=%ld)\n",
 				       other->name,
 				       i915_reset_engine_count(&i915->gpu_error,
-							       other) - resets[tmp]);
+							       other) -
+				       threads[tmp].resets);
 				if (!err)
 					err = -EINVAL;
 			}
@@ -807,7 +843,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 	if (i915_terminally_wedged(&i915->gpu_error))
 		err = -EIO;
 
-	if (active) {
+	if (flags & TEST_ACTIVE) {
 		mutex_lock(&i915->drm.struct_mutex);
 		hang_fini(&h);
 		mutex_unlock(&i915->drm.struct_mutex);
@@ -816,14 +852,42 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
 	return err;
 }
 
-static int igt_reset_idle_engine_others(void *arg)
+static int igt_reset_engines(void *arg)
 {
-	return __igt_reset_engine_others(arg, false);
-}
+	static const struct {
+		const char *name;
+		unsigned int flags;
+	} phases[] = {
+		{ "idle", 0 },
+		{ "active", TEST_ACTIVE },
+		{ "others-idle", TEST_OTHERS },
+		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
+		{
+			"others-priority",
+			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
+		},
+		{
+			"self-priority",
+			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
+		},
+		{ }
+	};
+	struct drm_i915_private *i915 = arg;
+	typeof(*phases) *p;
+	int err;
 
-static int igt_reset_active_engine_others(void *arg)
-{
-	return __igt_reset_engine_others(arg, true);
+	for (p = phases; p->name; p++) {
+		if (p->flags & TEST_PRIORITY) {
+			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
+				continue;
+		}
+
+		err = __igt_reset_engines(arg, p->name, p->flags);
+		if (err)
+			return err;
+	}
+
+	return 0;
 }
 
 static u32 fake_hangcheck(struct i915_request *rq)
@@ -1122,8 +1186,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(igt_hang_sanitycheck),
 		SUBTEST(igt_reset_idle_engine),
 		SUBTEST(igt_reset_active_engine),
-		SUBTEST(igt_reset_idle_engine_others),
-		SUBTEST(igt_reset_active_engine_others),
+		SUBTEST(igt_reset_engines),
 		SUBTEST(igt_wait_reset),
 		SUBTEST(igt_reset_queue),
 		SUBTEST(igt_handle_error),
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
  2018-03-22  7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
@ 2018-03-22  7:35 ` Chris Wilson
  2018-03-22 14:35   ` Mika Kuoppala
                     ` (2 more replies)
  2018-03-22  7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
                   ` (6 subsequent siblings)
  8 siblings, 3 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22  7:35 UTC (permalink / raw)
  To: intel-gfx

Using engine->irq_posted for execlists, we are not always serialised by
the tasklet as we supposed. On the reset paths, the tasklet is disabled
and ignored. Instead, we manipulate the engine->irq_posted directly to
account for the reset, but if an interrupt fired before the reset and so
wrote to engine->irq_posted, that write may not be flushed from the
local CPU's cacheline until much later as the tasklet is already active
and so does not generate a mb(). To correctly serialise the interrupt
with reset, we need serialisation on the set_bit() itself.

And at last Mika can be happy.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_irq.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index fa7310766217..27aee25429b7 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 	bool tasklet = false;
 
 	if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
-		if (READ_ONCE(engine->execlists.active)) {
-			__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-			tasklet = true;
-		}
+		if (READ_ONCE(engine->execlists.active))
+			tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
+						    &engine->irq_posted);
 	}
 
 	if (iir & GT_RENDER_USER_INTERRUPT) {
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
  2018-03-22  7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
  2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
@ 2018-03-22  7:35 ` Chris Wilson
  2018-03-22  7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22  7:35 UTC (permalink / raw)
  To: intel-gfx

After resetting the GPU (or subset of engines), call synchronize_irq()
to flush any pending irq before proceeding with the cleanup. For a
device level reset, we disable the interrupts around the reset, but when
resetting just one engine, we have to avoid such global disabling. This
leaves us open to an interrupt arriving for the engine as we try to
reset it. We already do try to flush the IIR following the reset, but we
have to ensure that the in-flight interrupt does not land after we start
cleaning up after the reset; enter synchronize_irq().

As it currently stands, we very rarely, but fatally, see sequences such as:

    2.... 57964564us : execlists_reset_prepare: rcs0
    2.... 57964613us : execlists_reset: rcs0 seqno=424
    0d.h1 57964615us : gen8_cs_irq_handler: rcs0 CS active=1
    2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060
    2.... 57964703us : execlists_reset_finish: rcs0
    0..s. 57964705us : execlists_submission_tasklet: rcs0 awake?=1, active=0, irq-posted?=1

v2: Move the sync into the execlists reset handler so that we coordinate
the flush with disabling the interrupt handling and canceling the
pending interrupt.
v3: Just use synchronize_hardirq() to avoid the might_sleep(), we do not
yet have threaded-irq to worry about.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c    | 7 ++++---
 drivers/gpu/drm/i915/intel_uncore.c | 4 +++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 67b6a0f658d6..ce09c5ad334f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -805,6 +805,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 	spin_unlock(&engine->timeline->lock);
 
+	/* Mark all CS interrupts as complete */
+	smp_store_mb(execlists->active, 0);
+	synchronize_hardirq(engine->i915->drm.irq);
+
 	/*
 	 * The port is checked prior to scheduling a tasklet, but
 	 * just in case we have suspended the tasklet to do the
@@ -813,9 +817,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	 */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
 
-	/* Mark all CS interrupts as complete */
-	execlists->active = 0;
-
 	local_irq_restore(flags);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 4c616d074a97..f37ecfc69e49 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -2116,8 +2116,10 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		i915_stop_engines(dev_priv, engine_mask);
 
 		ret = -ENODEV;
-		if (reset)
+		if (reset) {
+			GEM_TRACE("engine_mask=%x\n", engine_mask);
 			ret = reset(dev_priv, engine_mask);
+		}
 		if (ret != -ETIMEDOUT)
 			break;
 
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (2 preceding siblings ...)
  2018-03-22  7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
@ 2018-03-22  7:43 ` Patchwork
  2018-03-22  7:49 ` [PATCH v2] " Chris Wilson
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22  7:43 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
URL   : https://patchwork.freedesktop.org/series/40439/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
6adaf914c971 drm/i915/selftests: Include the trace as a debug aide
-:36: CHECK:SPACING: spaces preferred around that '*' (ctx:VxV)
#36: FILE: drivers/gpu/drm/i915/selftests/intel_hangcheck.c:627:
+			if (i915_request_wait(old, 0, 10*HZ) < 0) {
 			                                ^

total: 0 errors, 0 warnings, 1 checks, 43 lines checked
aae1777597a8 drm/i915/selftests: Stress resets-vs-request-priority
10e13a64240a drm/i915: Use full serialisation around engine->irq_posted
a41114e3c858 drm/i915: Flush pending interrupt following a GPU reset
-:23: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#23: 
    2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060

total: 0 errors, 1 warnings, 0 checks, 30 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (3 preceding siblings ...)
  2018-03-22  7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
@ 2018-03-22  7:49 ` Chris Wilson
  2018-03-22 14:26   ` Mika Kuoppala
  2018-03-22  7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-22  7:49 UTC (permalink / raw)
  To: intel-gfx

If we fail to reset the GPU in a timely fashion, dump the GEM trace so
that we can see what operations were in flight when the GPU got stuck.

v2: There's more than one timeout that deserves tracing!
v3: Silence checkpatch by not even using a product at all!

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4372826998aa..9b235dae8dd9 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
 {
 	struct wedge_me *w = container_of(work, typeof(*w), work.work);
 
-	pr_err("%pS timed out, cancelling all further testing.\n",
-	       w->symbol);
+	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
+
+	GEM_TRACE("%pS timed out.\n", w->symbol);
+	GEM_TRACE_DUMP();
+
 	i915_gem_set_wedged(w->i915);
 }
 
@@ -621,9 +624,19 @@ static int active_engine(void *data)
 		mutex_unlock(&engine->i915->drm.struct_mutex);
 
 		if (old) {
-			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
+			if (i915_request_wait(old, 0, HZ) < 0) {
+				GEM_TRACE("%s timed out.\n", engine->name);
+				GEM_TRACE_DUMP();
+
+				i915_gem_set_wedged(engine->i915);
+				i915_request_put(old);
+				err = -EIO;
+				break;
+			}
 			i915_request_put(old);
 		}
+
+		cond_resched();
 	}
 
 	for (count = 0; count < ARRAY_SIZE(rq); count++)
@@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 
 	err = i915_subtests(tests, i915);
 
+	mutex_lock(&i915->drm.struct_mutex);
+	flush_test(i915, I915_WAIT_LOCKED);
+	mutex_unlock(&i915->drm.struct_mutex);
+
 	i915_modparams.enable_hangcheck = saved_hangcheck;
 	intel_runtime_pm_put(i915);
 
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 19+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (4 preceding siblings ...)
  2018-03-22  7:49 ` [PATCH v2] " Chris Wilson
@ 2018-03-22  7:58 ` Patchwork
  2018-03-22  8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22  7:58 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
URL   : https://patchwork.freedesktop.org/series/40439/
State : success

== Summary ==

Series 40439v1 series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
https://patchwork.freedesktop.org/api/1.0/series/40439/revisions/1/mbox/

---- Known issues:

Test debugfs_test:
        Subgroup read_all_entries:
                incomplete -> PASS       (fi-snb-2520m) fdo#103713
Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                fail       -> PASS       (fi-skl-6770hq) fdo#100368
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                pass       -> DMESG-FAIL (fi-cnl-y3) fdo#104951
        Subgroup suspend-read-crc-pipe-c:
                pass       -> INCOMPLETE (fi-bxt-dsi) fdo#103927

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#104951 https://bugs.freedesktop.org/show_bug.cgi?id=104951
fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:430s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:442s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:379s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:536s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:297s
fi-bxt-dsi       total:243  pass:216  dwarn:0   dfail:0   fail:0   skip:26 
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:511s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:513s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:503s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:416s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:567s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:527s
fi-cnl-y3        total:285  pass:258  dwarn:0   dfail:1   fail:0   skip:26  time:587s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:427s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:320s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:531s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:404s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:419s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:469s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:434s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:475s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:464s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:513s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:653s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:437s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:535s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:504s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:512s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:427s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:563s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:403s
fi-cfl-u failed to connect after reboot

dff9ece60048108782aab6d6123822c1d34b0e5a drm-tip: 2018y-03m-21d-20h-44m-14s UTC integration manifest
a41114e3c858 drm/i915: Flush pending interrupt following a GPU reset
10e13a64240a drm/i915: Use full serialisation around engine->irq_posted
aae1777597a8 drm/i915/selftests: Stress resets-vs-request-priority
6adaf914c971 drm/i915/selftests: Include the trace as a debug aide

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8443/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (5 preceding siblings ...)
  2018-03-22  7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
@ 2018-03-22  8:02 ` Patchwork
  2018-03-22  8:19 ` ✓ Fi.CI.BAT: success " Patchwork
  2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork
  8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22  8:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL   : https://patchwork.freedesktop.org/series/40439/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
d4b28d06e3b8 drm/i915/selftests: Include the trace as a debug aide
59301a5a5c45 drm/i915/selftests: Stress resets-vs-request-priority
4eb53ac79094 drm/i915: Use full serialisation around engine->irq_posted
cf4fae2b05ff drm/i915: Flush pending interrupt following a GPU reset
-:23: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#23: 
    2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060

total: 0 errors, 1 warnings, 0 checks, 30 lines checked

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (6 preceding siblings ...)
  2018-03-22  8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
@ 2018-03-22  8:19 ` Patchwork
  2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork
  8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22  8:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL   : https://patchwork.freedesktop.org/series/40439/
State : success

== Summary ==

Series 40439v2 series starting with [v2] drm/i915/selftests: Include the trace as a debug aide
https://patchwork.freedesktop.org/api/1.0/series/40439/revisions/2/mbox/

---- Known issues:

Test debugfs_test:
        Subgroup read_all_entries:
                incomplete -> PASS       (fi-snb-2520m) fdo#103713 +1
Test gem_exec_suspend:
        Subgroup basic-s3:
                pass       -> INCOMPLETE (fi-skl-6260u) fdo#104108
Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                fail       -> PASS       (fi-skl-6770hq) fdo#100368

fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#104108 https://bugs.freedesktop.org/show_bug.cgi?id=104108
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:433s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:444s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:380s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:535s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:297s
fi-bxt-dsi       total:285  pass:255  dwarn:0   dfail:0   fail:0   skip:30  time:512s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:513s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:516s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:503s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:409s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:564s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:535s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:588s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:429s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:320s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:536s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:401s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:425s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:472s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:434s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:480s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:466s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:513s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:652s
fi-skl-6260u     total:108  pass:104  dwarn:0   dfail:0   fail:0   skip:3  
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:529s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:495s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:500s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:438s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:242  pass:208  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:398s

dff9ece60048108782aab6d6123822c1d34b0e5a drm-tip: 2018y-03m-21d-20h-44m-14s UTC integration manifest
cf4fae2b05ff drm/i915: Flush pending interrupt following a GPU reset
4eb53ac79094 drm/i915: Use full serialisation around engine->irq_posted
59301a5a5c45 drm/i915/selftests: Stress resets-vs-request-priority
d4b28d06e3b8 drm/i915/selftests: Include the trace as a debug aide

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8444/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* ✓ Fi.CI.IGT: success for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
  2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
                   ` (7 preceding siblings ...)
  2018-03-22  8:19 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2018-03-22 10:23 ` Patchwork
  8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 10:23 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL   : https://patchwork.freedesktop.org/series/40439/
State : success

== Summary ==

---- Known issues:

Test kms_cursor_legacy:
        Subgroup flip-vs-cursor-atomic:
                pass       -> FAIL       (shard-hsw) fdo#102670
Test kms_flip:
        Subgroup flip-vs-wf_vblank-interruptible:
                fail       -> PASS       (shard-hsw) fdo#100368 +1
Test kms_pipe_crc_basic:
        Subgroup read-crc-pipe-c-frame-sequence:
                pass       -> FAIL       (shard-apl) fdo#103481
Test kms_setmode:
        Subgroup basic:
                pass       -> FAIL       (shard-apl) fdo#99912
Test kms_vblank:
        Subgroup pipe-a-ts-continuation-suspend:
                pass       -> INCOMPLETE (shard-hsw) fdo#103540

fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103481 https://bugs.freedesktop.org/show_bug.cgi?id=103481
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540

shard-apl        total:3478 pass:1814 dwarn:1   dfail:0   fail:8   skip:1655 time:13097s
shard-hsw        total:3391 pass:1726 dwarn:1   dfail:0   fail:3   skip:1659 time:11609s
shard-snb        total:3478 pass:1357 dwarn:1   dfail:0   fail:3   skip:2117 time:7279s
Blacklisted hosts:
shard-kbl        total:3454 pass:1924 dwarn:1   dfail:0   fail:9   skip:1519 time:9758s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8444/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22  7:49 ` [PATCH v2] " Chris Wilson
@ 2018-03-22 14:26   ` Mika Kuoppala
  2018-03-22 14:30     ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2018-03-22 14:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> that we can see what operations were in flight when the GPU got stuck.
>
> v2: There's more than one timeout that deserves tracing!
> v3: Silence checkpatch by not even using a product at all!
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
>  1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 4372826998aa..9b235dae8dd9 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
>  {
>  	struct wedge_me *w = container_of(work, typeof(*w), work.work);
>  
> -	pr_err("%pS timed out, cancelling all further testing.\n",
> -	       w->symbol);
> +	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> +
> +	GEM_TRACE("%pS timed out.\n", w->symbol);
> +	GEM_TRACE_DUMP();
> +
>  	i915_gem_set_wedged(w->i915);
>  }
>  
> @@ -621,9 +624,19 @@ static int active_engine(void *data)
>  		mutex_unlock(&engine->i915->drm.struct_mutex);
>  
>  		if (old) {
> -			i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> +			if (i915_request_wait(old, 0, HZ) < 0) {
> +				GEM_TRACE("%s timed out.\n", engine->name);
> +				GEM_TRACE_DUMP();
> +
> +				i915_gem_set_wedged(engine->i915);
> +				i915_request_put(old);
> +				err = -EIO;
> +				break;
> +			}

Using err = i915_request_wait() could have saved one extra request_put,
but I don't know if it would be any cleaner.

>  			i915_request_put(old);
>  		}
> +
> +		cond_resched();

To give more slack for the other engines and the main thread to proceed?

>  	}
>  
>  	for (count = 0; count < ARRAY_SIZE(rq); count++)
> @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>  
>  	err = i915_subtests(tests, i915);
>  
> +	mutex_lock(&i915->drm.struct_mutex);
> +	flush_test(i915, I915_WAIT_LOCKED);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +

To wash out leftovers.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

>  	i915_modparams.enable_hangcheck = saved_hangcheck;
>  	intel_runtime_pm_put(i915);
>  
> -- 
> 2.16.2
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22 14:26   ` Mika Kuoppala
@ 2018-03-22 14:30     ` Chris Wilson
  2018-03-22 19:29       ` Jeff McGee
  0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 14:30 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-03-22 14:26:41)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > that we can see what operations were in flight when the GPU got stuck.
> >
> > v2: There's more than one timeout that deserves tracing!
> > v3: Silence checkpatch by not even using a product at all!
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> >  1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 4372826998aa..9b235dae8dd9 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> >  {
> >       struct wedge_me *w = container_of(work, typeof(*w), work.work);
> >  
> > -     pr_err("%pS timed out, cancelling all further testing.\n",
> > -            w->symbol);
> > +     pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > +
> > +     GEM_TRACE("%pS timed out.\n", w->symbol);
> > +     GEM_TRACE_DUMP();
> > +
> >       i915_gem_set_wedged(w->i915);
> >  }
> >  
> > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> >               mutex_unlock(&engine->i915->drm.struct_mutex);
> >  
> >               if (old) {
> > -                     i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > +                     if (i915_request_wait(old, 0, HZ) < 0) {
> > +                             GEM_TRACE("%s timed out.\n", engine->name);
> > +                             GEM_TRACE_DUMP();
> > +
> > +                             i915_gem_set_wedged(engine->i915);
> > +                             i915_request_put(old);
> > +                             err = -EIO;
> > +                             break;
> > +                     }
> 
> Using err = i915_request_wait() could have saved one extra request_put
> but I dunno if it would be any cleaner.

It's also -ETIME, which didn't fit my intention.

> 
> >                       i915_request_put(old);
> >               }
> > +
> > +             cond_resched();
> 
> To give more slack for other engines and main thread to proceed?

Yes. Otherwise, it spins mighty fine.
> 
> >       }
> >  
> >       for (count = 0; count < ARRAY_SIZE(rq); count++)
> > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> >  
> >       err = i915_subtests(tests, i915);
> >  
> > +     mutex_lock(&i915->drm.struct_mutex);
> > +     flush_test(i915, I915_WAIT_LOCKED);
> > +     mutex_unlock(&i915->drm.struct_mutex);
> > +
> 
> To wash out leftovers.

Yeah, from the early abort we left requests unaccounted for and needed
to grab the struct_mutex to run retire. Otherwise we hit assertions
later on about trying to unload the driver with requests still in flight.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
@ 2018-03-22 14:35   ` Mika Kuoppala
  2018-03-22 15:34   ` Jeff McGee
  2018-03-30 23:08   ` Chris Wilson
  2 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2018-03-22 14:35 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
>
> And at last Mika can be happy.

Yes.

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_irq.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>  	bool tasklet = false;
>  
>  	if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> -		if (READ_ONCE(engine->execlists.active)) {
> -			__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -			tasklet = true;
> -		}
> +		if (READ_ONCE(engine->execlists.active))
> +			tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> +						    &engine->irq_posted);
>  	}
>  
>  	if (iir & GT_RENDER_USER_INTERRUPT) {
> -- 
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
  2018-03-22 14:35   ` Mika Kuoppala
@ 2018-03-22 15:34   ` Jeff McGee
  2018-03-22 17:01     ` Chris Wilson
  2018-03-30 23:08   ` Chris Wilson
  2 siblings, 1 reply; 19+ messages in thread
From: Jeff McGee @ 2018-03-22 15:34 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Thu, Mar 22, 2018 at 07:35:32AM +0000, Chris Wilson wrote:
> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
> 
> And at last Mika can be happy.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>  	bool tasklet = false;
>  
>  	if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> -		if (READ_ONCE(engine->execlists.active)) {
> -			__set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -			tasklet = true;
> -		}
> +		if (READ_ONCE(engine->execlists.active))
> +			tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> +						    &engine->irq_posted);
>  	}
>  
>  	if (iir & GT_RENDER_USER_INTERRUPT) {
> -- 
> 2.16.2
> 

Confirmed that this along with the interrupt flush eliminates the cases
of finding CSB tail at its reset value (0x7) in the tasklet in my force
preemption tests.

Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-22 15:34   ` Jeff McGee
@ 2018-03-22 17:01     ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 17:01 UTC (permalink / raw)
  To: Jeff McGee; +Cc: intel-gfx, Mika

Quoting Jeff McGee (2018-03-22 15:34:45)
> On Thu, Mar 22, 2018 at 07:35:32AM +0000, Chris Wilson wrote:
> > Using engine->irq_posted for execlists, we are not always serialised by
> > the tasklet as we supposed. On the reset paths, the tasklet is disabled
> > and ignored. Instead, we manipulate the engine->irq_posted directly to
> > account for the reset, but if an interrupt fired before the reset and so
> > wrote to engine->irq_posted, that write may not be flushed from the
> > local CPU's cacheline until much later as the tasklet is already active
> > and so does not generate a mb(). To correctly serialise the interrupt
> > with reset, we need serialisation on the set_bit() itself.
> > 
> > And at last Mika can be happy.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > CC: Michel Thierry <michel.thierry@intel.com>
> > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> >  1 file changed, 3 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index fa7310766217..27aee25429b7 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> >       bool tasklet = false;
> >  
> >       if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> > -             if (READ_ONCE(engine->execlists.active)) {
> > -                     __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > -                     tasklet = true;
> > -             }
> > +             if (READ_ONCE(engine->execlists.active))
> > +                     tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> > +                                                 &engine->irq_posted);
> >       }
> >  
> >       if (iir & GT_RENDER_USER_INTERRUPT) {
> > -- 
> > 2.16.2
> > 
> 
> Confirmed that this along with the interrupt flush eliminates the cases
> of finding CSB tail at its reset value (0x7) in the tasklet in my force
> preemption tests.

At the moment, I'm concerned about the failures we have in CI before we
go building on top. So care to complete the set of r-b for us to move
on?
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22 14:30     ` Chris Wilson
@ 2018-03-22 19:29       ` Jeff McGee
  2018-03-22 20:37         ` Chris Wilson
  0 siblings, 1 reply; 19+ messages in thread
From: Jeff McGee @ 2018-03-22 19:29 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Thu, Mar 22, 2018 at 02:30:09PM +0000, Chris Wilson wrote:
> Quoting Mika Kuoppala (2018-03-22 14:26:41)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > 
> > > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > > that we can see what operations were in flight when the GPU got stuck.
> > >
> > > v2: There's more than one timeout that deserves tracing!
> > > v3: Silence checkpatch by not even using a product at all!
> > >
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > ---
> > >  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> > >  1 file changed, 20 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > index 4372826998aa..9b235dae8dd9 100644
> > > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> > >  {
> > >       struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > >  
> > > -     pr_err("%pS timed out, cancelling all further testing.\n",
> > > -            w->symbol);
> > > +     pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > > +
> > > +     GEM_TRACE("%pS timed out.\n", w->symbol);
> > > +     GEM_TRACE_DUMP();
> > > +
> > >       i915_gem_set_wedged(w->i915);
> > >  }
> > >  
> > > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> > >               mutex_unlock(&engine->i915->drm.struct_mutex);
> > >  
> > >               if (old) {
> > > -                     i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > > +                     if (i915_request_wait(old, 0, HZ) < 0) {
> > > +                             GEM_TRACE("%s timed out.\n", engine->name);
> > > +                             GEM_TRACE_DUMP();
> > > +
> > > +                             i915_gem_set_wedged(engine->i915);
> > > +                             i915_request_put(old);
> > > +                             err = -EIO;
> > > +                             break;
> > > +                     }
> > 
> > Using err = i915_request_wait() could have saved one extra request_put
> > but I dunno if it would be any cleaner.
> 
> It's also -ETIME, which didn't fit my intention.
> 
> > 
> > >                       i915_request_put(old);
> > >               }
> > > +
> > > +             cond_resched();
> > 
> > To give more slack for other engines and main thread to proceed?
> 
> Yes. Otherwise, it spins mighty fine.
> > 
> > >       }
> > >  
> > >       for (count = 0; count < ARRAY_SIZE(rq); count++)
> > > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> > >  
> > >       err = i915_subtests(tests, i915);
> > >  
> > > +     mutex_lock(&i915->drm.struct_mutex);
> > > +     flush_test(i915, I915_WAIT_LOCKED);
> > > +     mutex_unlock(&i915->drm.struct_mutex);
> > > +
> > 
> > To wash out leftovers.
> 
> Yeah, from the early abort we left requests unaccounted for and needed
> to grab the struct_mutex to run retire. Otherwise we hit assertions
> later on about trying to unload the driver with requests still inflight.
> -Chris

On this and the 3 others in this series...

Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
  2018-03-22 19:29       ` Jeff McGee
@ 2018-03-22 20:37         ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 20:37 UTC (permalink / raw)
  To: Jeff McGee; +Cc: intel-gfx

Quoting Jeff McGee (2018-03-22 19:29:16)
> On Thu, Mar 22, 2018 at 02:30:09PM +0000, Chris Wilson wrote:
> > Quoting Mika Kuoppala (2018-03-22 14:26:41)
> > > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > > 
> > > > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > > > that we can see what operations were in flight when the GPU got stuck.
> > > >
> > > > v2: There's more than one timeout that deserves tracing!
> > > > v3: Silence checkpatch by not even using a product at all!
> > > >
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > ---
> > > >  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> > > >  1 file changed, 20 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > index 4372826998aa..9b235dae8dd9 100644
> > > > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> > > >  {
> > > >       struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > > >  
> > > > -     pr_err("%pS timed out, cancelling all further testing.\n",
> > > > -            w->symbol);
> > > > +     pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > > > +
> > > > +     GEM_TRACE("%pS timed out.\n", w->symbol);
> > > > +     GEM_TRACE_DUMP();
> > > > +
> > > >       i915_gem_set_wedged(w->i915);
> > > >  }
> > > >  
> > > > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> > > >               mutex_unlock(&engine->i915->drm.struct_mutex);
> > > >  
> > > >               if (old) {
> > > > -                     i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > > > +                     if (i915_request_wait(old, 0, HZ) < 0) {
> > > > +                             GEM_TRACE("%s timed out.\n", engine->name);
> > > > +                             GEM_TRACE_DUMP();
> > > > +
> > > > +                             i915_gem_set_wedged(engine->i915);
> > > > +                             i915_request_put(old);
> > > > +                             err = -EIO;
> > > > +                             break;
> > > > +                     }
> > > 
> > > Using err = i915_request_wait() could have saved one extra request_put
> > > but I dunno if it would be any cleaner.
> > 
> > It's also -ETIME, which didn't fit my intention.
> > 
> > > 
> > > >                       i915_request_put(old);
> > > >               }
> > > > +
> > > > +             cond_resched();
> > > 
> > > To give more slack for other engines and main thread to proceed?
> > 
> > Yes. Otherwise, it spins mighty fine.
> > > 
> > > >       }
> > > >  
> > > >       for (count = 0; count < ARRAY_SIZE(rq); count++)
> > > > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> > > >  
> > > >       err = i915_subtests(tests, i915);
> > > >  
> > > > +     mutex_lock(&i915->drm.struct_mutex);
> > > > +     flush_test(i915, I915_WAIT_LOCKED);
> > > > +     mutex_unlock(&i915->drm.struct_mutex);
> > > > +
> > > 
> > > To wash out leftovers.
> > 
> > Yeah, from the early abort we left requests unaccounted for and needed
> > to grab the struct_mutex to run retire. Otherwise we hit assertions
> > later on about trying to unload the driver with requests still inflight.
> > -Chris
> 
> On this and the 3 others in this series...
> 
> Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>

Much appreciated. Pushed, let's hope CI holds up and that we're ready to
start talking about the real changes required for forced preemption as
opposed to getting the existing code working.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
  2018-03-22 14:35   ` Mika Kuoppala
  2018-03-22 15:34   ` Jeff McGee
@ 2018-03-30 23:08   ` Chris Wilson
  2018-03-31  8:59     ` Chris Wilson
  2 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-30 23:08 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika

Quoting Chris Wilson (2018-03-22 07:35:32)
> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
> 
> And at last Mika can be happy.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_irq.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
>         bool tasklet = false;
>  
>         if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> -               if (READ_ONCE(engine->execlists.active)) {
> -                       __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -                       tasklet = true;
> -               }
> +               if (READ_ONCE(engine->execlists.active))
> +                       tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> +                                                   &engine->irq_posted);

This is driving me mad. We see a very rare missed interrupt unless we
unconditionally kick the tasklet:

        if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
-               if (READ_ONCE(engine->execlists.active))
-                       tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
-                                                   &engine->irq_posted);
+               if (READ_ONCE(engine->execlists.active)) {
+                       set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+                       tasklet = true;
+               }
        }

I can't see why.

Hmm, I wonder if we are seeing READ_ONCE(execlists->active) false
negatives.

Getting close to admitting defeat :(
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
  2018-03-30 23:08   ` Chris Wilson
@ 2018-03-31  8:59     ` Chris Wilson
  0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-31  8:59 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-03-31 00:08:47)
> Quoting Chris Wilson (2018-03-22 07:35:32)
> > Using engine->irq_posted for execlists, we are not always serialised by
> > the tasklet as we supposed. On the reset paths, the tasklet is disabled
> > and ignored. Instead, we manipulate the engine->irq_posted directly to
> > account for the reset, but if an interrupt fired before the reset and so
> > wrote to engine->irq_posted, that write may not be flushed from the
> > local CPU's cacheline until much later as the tasklet is already active
> > and so does not generate a mb(). To correctly serialise the interrupt
> > with reset, we need serialisation on the set_bit() itself.
> > 
> > And at last Mika can be happy.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > CC: Michel Thierry <michel.thierry@intel.com>
> > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> >  1 file changed, 3 insertions(+), 4 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index fa7310766217..27aee25429b7 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> >         bool tasklet = false;
> >  
> >         if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> > -               if (READ_ONCE(engine->execlists.active)) {
> > -                       __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > -                       tasklet = true;
> > -               }
> > +               if (READ_ONCE(engine->execlists.active))
> > +                       tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> > +                                                   &engine->irq_posted);
> 
> This is driving me mad. A very rare missed interrupt unless we
> unconditionally kick tasklet:
> 
>         if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> -               if (READ_ONCE(engine->execlists.active))
> -                       tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> -                                                   &engine->irq_posted);
> +               if (READ_ONCE(engine->execlists.active)) {
> +                       set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> +                       tasklet = true;
> +               }
>         }
> 
> I can't see why.
> 
> Hmm, I wonder if we are seeing READ_ONCE(execlsts->active) false
> negatives.

Fortunately, doesn't appear to be that.

@@ -1405,9 +1405,10 @@  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 	bool tasklet = false;
 
 	if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
-		if (READ_ONCE(engine->execlists.active))
-			tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
-						    &engine->irq_posted);
+		GEM_BUG_ON(!READ_ONCE(execlists->tasklet.state) &&
+			   test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted));
+		tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
+					    &engine->irq_posted);
 	}

Hasn't even hit a BUG, which is a little disconcerting.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2018-03-31  8:59 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-22  7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
2018-03-22  7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
2018-03-22  7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
2018-03-22 14:35   ` Mika Kuoppala
2018-03-22 15:34   ` Jeff McGee
2018-03-22 17:01     ` Chris Wilson
2018-03-30 23:08   ` Chris Wilson
2018-03-31  8:59     ` Chris Wilson
2018-03-22  7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
2018-03-22  7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
2018-03-22  7:49 ` [PATCH v2] " Chris Wilson
2018-03-22 14:26   ` Mika Kuoppala
2018-03-22 14:30     ` Chris Wilson
2018-03-22 19:29       ` Jeff McGee
2018-03-22 20:37         ` Chris Wilson
2018-03-22  7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
2018-03-22  8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
2018-03-22  8:19 ` ✓ Fi.CI.BAT: success " Patchwork
2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.