* [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide
@ 2018-03-22 7:35 Chris Wilson
2018-03-22 7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
` (8 more replies)
0 siblings, 9 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 7:35 UTC (permalink / raw)
To: intel-gfx
If we fail to reset the GPU in a timely fashion, dump the GEM trace so
that we can see what operations were in flight when the GPU got stuck.
v2: There's more than one timeout that deserves tracing!
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4372826998aa..1969a65072ca 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
{
struct wedge_me *w = container_of(work, typeof(*w), work.work);
- pr_err("%pS timed out, cancelling all further testing.\n",
- w->symbol);
+ pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
+
+ GEM_TRACE("%pS timed out.\n", w->symbol);
+ GEM_TRACE_DUMP();
+
i915_gem_set_wedged(w->i915);
}
@@ -621,9 +624,19 @@ static int active_engine(void *data)
mutex_unlock(&engine->i915->drm.struct_mutex);
if (old) {
- i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
+ if (i915_request_wait(old, 0, 10*HZ) < 0) {
+ GEM_TRACE("%s timed out.\n", engine->name);
+ GEM_TRACE_DUMP();
+
+ i915_gem_set_wedged(engine->i915);
+ i915_request_put(old);
+ err = -EIO;
+ break;
+ }
i915_request_put(old);
}
+
+ cond_resched();
}
for (count = 0; count < ARRAY_SIZE(rq); count++)
@@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
err = i915_subtests(tests, i915);
+ mutex_lock(&i915->drm.struct_mutex);
+ flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
i915_modparams.enable_hangcheck = saved_hangcheck;
intel_runtime_pm_put(i915);
--
2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
@ 2018-03-22 7:35 ` Chris Wilson
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
` (7 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 7:35 UTC (permalink / raw)
To: intel-gfx
Watch what happens if we try to reset with a queue of requests with
varying priorities -- that may need reordering or preemption across the
reset.
v2: Tweak priorities to avoid starving the hanging thread.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 189 +++++++++++++++--------
1 file changed, 126 insertions(+), 63 deletions(-)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 1969a65072ca..c5ed4006c319 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -25,6 +25,7 @@
#include <linux/kthread.h>
#include "../i915_selftest.h"
+#include "i915_random.h"
#include "mock_context.h"
#include "mock_drm.h"
@@ -486,6 +487,8 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
do {
+ u32 seqno = intel_engine_get_seqno(engine);
+
if (active) {
struct i915_request *rq;
@@ -514,12 +517,13 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
+ GEM_BUG_ON(!rq->global_seqno);
+ seqno = rq->global_seqno - 1;
i915_request_put(rq);
}
engine->hangcheck.stalled = true;
- engine->hangcheck.seqno =
- intel_engine_get_seqno(engine);
+ engine->hangcheck.seqno = seqno;
err = i915_reset_engine(engine, NULL);
if (err) {
@@ -576,11 +580,25 @@ static int igt_reset_active_engine(void *arg)
return __igt_reset_engine(arg, true);
}
+struct active_engine {
+ struct task_struct *task;
+ struct intel_engine_cs *engine;
+ unsigned long resets;
+ unsigned int flags;
+};
+
+#define TEST_ACTIVE BIT(0)
+#define TEST_OTHERS BIT(1)
+#define TEST_SELF BIT(2)
+#define TEST_PRIORITY BIT(3)
+
static int active_engine(void *data)
{
- struct intel_engine_cs *engine = data;
- struct i915_request *rq[2] = {};
- struct i915_gem_context *ctx[2];
+ I915_RND_STATE(prng);
+ struct active_engine *arg = data;
+ struct intel_engine_cs *engine = arg->engine;
+ struct i915_request *rq[8] = {};
+ struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
struct drm_file *file;
unsigned long count = 0;
int err = 0;
@@ -589,25 +607,20 @@ static int active_engine(void *data)
if (IS_ERR(file))
return PTR_ERR(file);
- mutex_lock(&engine->i915->drm.struct_mutex);
- ctx[0] = live_context(engine->i915, file);
- mutex_unlock(&engine->i915->drm.struct_mutex);
- if (IS_ERR(ctx[0])) {
- err = PTR_ERR(ctx[0]);
- goto err_file;
- }
-
- mutex_lock(&engine->i915->drm.struct_mutex);
- ctx[1] = live_context(engine->i915, file);
- mutex_unlock(&engine->i915->drm.struct_mutex);
- if (IS_ERR(ctx[1])) {
- err = PTR_ERR(ctx[1]);
- i915_gem_context_put(ctx[0]);
- goto err_file;
+ for (count = 0; count < ARRAY_SIZE(ctx); count++) {
+ mutex_lock(&engine->i915->drm.struct_mutex);
+ ctx[count] = live_context(engine->i915, file);
+ mutex_unlock(&engine->i915->drm.struct_mutex);
+ if (IS_ERR(ctx[count])) {
+ err = PTR_ERR(ctx[count]);
+ while (--count)
+ i915_gem_context_put(ctx[count]);
+ goto err_file;
+ }
}
while (!kthread_should_stop()) {
- unsigned int idx = count++ & 1;
+ unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
struct i915_request *old = rq[idx];
struct i915_request *new;
@@ -619,6 +632,10 @@ static int active_engine(void *data)
break;
}
+ if (arg->flags & TEST_PRIORITY)
+ ctx[idx]->priority =
+ i915_prandom_u32_max_state(512, &prng);
+
rq[idx] = i915_request_get(new);
i915_request_add(new);
mutex_unlock(&engine->i915->drm.struct_mutex);
@@ -647,8 +664,9 @@ static int active_engine(void *data)
return err;
}
-static int __igt_reset_engine_others(struct drm_i915_private *i915,
- bool active)
+static int __igt_reset_engines(struct drm_i915_private *i915,
+ const char *test_name,
+ unsigned int flags)
{
struct intel_engine_cs *engine, *other;
enum intel_engine_id id, tmp;
@@ -662,50 +680,61 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
if (!intel_has_reset_engine(i915))
return 0;
- if (active) {
+ if (flags & TEST_ACTIVE) {
mutex_lock(&i915->drm.struct_mutex);
err = hang_init(&h, i915);
mutex_unlock(&i915->drm.struct_mutex);
if (err)
return err;
+
+ if (flags & TEST_PRIORITY)
+ h.ctx->priority = 1024;
}
for_each_engine(engine, i915, id) {
- struct task_struct *threads[I915_NUM_ENGINES] = {};
- unsigned long resets[I915_NUM_ENGINES];
+ struct active_engine threads[I915_NUM_ENGINES] = {};
unsigned long global = i915_reset_count(&i915->gpu_error);
- unsigned long count = 0;
+ unsigned long count = 0, reported;
IGT_TIMEOUT(end_time);
- if (active && !intel_engine_can_store_dword(engine))
+ if (flags & TEST_ACTIVE &&
+ !intel_engine_can_store_dword(engine))
continue;
memset(threads, 0, sizeof(threads));
for_each_engine(other, i915, tmp) {
struct task_struct *tsk;
- resets[tmp] = i915_reset_engine_count(&i915->gpu_error,
- other);
+ threads[tmp].resets =
+ i915_reset_engine_count(&i915->gpu_error,
+ other);
- if (other == engine)
+ if (!(flags & TEST_OTHERS))
continue;
- tsk = kthread_run(active_engine, other,
+ if (other == engine && !(flags & TEST_SELF))
+ continue;
+
+ threads[tmp].engine = other;
+ threads[tmp].flags = flags;
+
+ tsk = kthread_run(active_engine, &threads[tmp],
"igt/%s", other->name);
if (IS_ERR(tsk)) {
err = PTR_ERR(tsk);
goto unwind;
}
- threads[tmp] = tsk;
+ threads[tmp].task = tsk;
get_task_struct(tsk);
}
set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
do {
- if (active) {
- struct i915_request *rq;
+ u32 seqno = intel_engine_get_seqno(engine);
+ struct i915_request *rq = NULL;
+ if (flags & TEST_ACTIVE) {
mutex_lock(&i915->drm.struct_mutex);
rq = hang_create_request(&h, engine);
if (IS_ERR(rq)) {
@@ -731,33 +760,38 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
break;
}
- i915_request_put(rq);
+ GEM_BUG_ON(!rq->global_seqno);
+ seqno = rq->global_seqno - 1;
}
engine->hangcheck.stalled = true;
- engine->hangcheck.seqno =
- intel_engine_get_seqno(engine);
+ engine->hangcheck.seqno = seqno;
err = i915_reset_engine(engine, NULL);
if (err) {
- pr_err("i915_reset_engine(%s:%s) failed, err=%d\n",
- engine->name, active ? "active" : "idle", err);
+ pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
+ engine->name, test_name, err);
break;
}
engine->hangcheck.stalled = false;
count++;
+
+ if (rq) {
+ i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
+ i915_request_put(rq);
+ }
} while (time_before(jiffies, end_time));
clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
pr_info("i915_reset_engine(%s:%s): %lu resets\n",
- engine->name, active ? "active" : "idle", count);
-
- if (i915_reset_engine_count(&i915->gpu_error, engine) -
- resets[engine->id] != (active ? count : 0)) {
- pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
- engine->name, active ? "active" : "idle", count,
- i915_reset_engine_count(&i915->gpu_error,
- engine) - resets[engine->id]);
+ engine->name, test_name, count);
+
+ reported = i915_reset_engine_count(&i915->gpu_error, engine);
+ reported -= threads[engine->id].resets;
+ if (reported != (flags & TEST_ACTIVE ? count : 0)) {
+ pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
+ engine->name, test_name, count, reported,
+ (flags & TEST_ACTIVE ? count : 0));
if (!err)
err = -EINVAL;
}
@@ -766,24 +800,26 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
for_each_engine(other, i915, tmp) {
int ret;
- if (!threads[tmp])
+ if (!threads[tmp].task)
continue;
- ret = kthread_stop(threads[tmp]);
+ ret = kthread_stop(threads[tmp].task);
if (ret) {
pr_err("kthread for other engine %s failed, err=%d\n",
other->name, ret);
if (!err)
err = ret;
}
- put_task_struct(threads[tmp]);
+ put_task_struct(threads[tmp].task);
- if (resets[tmp] != i915_reset_engine_count(&i915->gpu_error,
- other)) {
+ if (other != engine &&
+ threads[tmp].resets !=
+ i915_reset_engine_count(&i915->gpu_error, other)) {
pr_err("Innocent engine %s was reset (count=%ld)\n",
other->name,
i915_reset_engine_count(&i915->gpu_error,
- other) - resets[tmp]);
+ other) -
+ threads[tmp].resets);
if (!err)
err = -EINVAL;
}
@@ -807,7 +843,7 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
if (i915_terminally_wedged(&i915->gpu_error))
err = -EIO;
- if (active) {
+ if (flags & TEST_ACTIVE) {
mutex_lock(&i915->drm.struct_mutex);
hang_fini(&h);
mutex_unlock(&i915->drm.struct_mutex);
@@ -816,14 +852,42 @@ static int __igt_reset_engine_others(struct drm_i915_private *i915,
return err;
}
-static int igt_reset_idle_engine_others(void *arg)
+static int igt_reset_engines(void *arg)
{
- return __igt_reset_engine_others(arg, false);
-}
+ static const struct {
+ const char *name;
+ unsigned int flags;
+ } phases[] = {
+ { "idle", 0 },
+ { "active", TEST_ACTIVE },
+ { "others-idle", TEST_OTHERS },
+ { "others-active", TEST_OTHERS | TEST_ACTIVE },
+ {
+ "others-priority",
+ TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
+ },
+ {
+ "self-priority",
+ TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
+ },
+ { }
+ };
+ struct drm_i915_private *i915 = arg;
+ typeof(*phases) *p;
+ int err;
-static int igt_reset_active_engine_others(void *arg)
-{
- return __igt_reset_engine_others(arg, true);
+ for (p = phases; p->name; p++) {
+ if (p->flags & TEST_PRIORITY) {
+ if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
+ continue;
+ }
+
+ err = __igt_reset_engines(arg, p->name, p->flags);
+ if (err)
+ return err;
+ }
+
+ return 0;
}
static u32 fake_hangcheck(struct i915_request *rq)
@@ -1122,8 +1186,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_hang_sanitycheck),
SUBTEST(igt_reset_idle_engine),
SUBTEST(igt_reset_active_engine),
- SUBTEST(igt_reset_idle_engine_others),
- SUBTEST(igt_reset_active_engine_others),
+ SUBTEST(igt_reset_engines),
SUBTEST(igt_wait_reset),
SUBTEST(igt_reset_queue),
SUBTEST(igt_handle_error),
--
2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
2018-03-22 7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
@ 2018-03-22 7:35 ` Chris Wilson
2018-03-22 14:35 ` Mika Kuoppala
` (2 more replies)
2018-03-22 7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
` (6 subsequent siblings)
8 siblings, 3 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 7:35 UTC (permalink / raw)
To: intel-gfx
Using engine->irq_posted for execlists, we are not always serialised by
the tasklet as we supposed. On the reset paths, the tasklet is disabled
and ignored. Instead, we manipulate the engine->irq_posted directly to
account for the reset, but if an interrupt fired before the reset and so
wrote to engine->irq_posted, that write may not be flushed from the
local CPU's cacheline until much later as the tasklet is already active
and so does not generate a mb(). To correctly serialise the interrupt
with reset, we need serialisation on the set_bit() itself.
And at last Mika can be happy.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
CC: Michel Thierry <michel.thierry@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
drivers/gpu/drm/i915/i915_irq.c | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index fa7310766217..27aee25429b7 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
bool tasklet = false;
if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
- if (READ_ONCE(engine->execlists.active)) {
- __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
- tasklet = true;
- }
+ if (READ_ONCE(engine->execlists.active))
+ tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
+ &engine->irq_posted);
}
if (iir & GT_RENDER_USER_INTERRUPT) {
--
2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread
* [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
2018-03-22 7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
@ 2018-03-22 7:35 ` Chris Wilson
2018-03-22 7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
` (5 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 7:35 UTC (permalink / raw)
To: intel-gfx
After resetting the GPU (or subset of engines), call synchronize_irq()
to flush any pending irq before proceeding with the cleanup. For a
device level reset, we disable the interupts around the reset, but when
resetting just one engine, we have to avoid such global disabling. This
leaves us open to an interrupt arriving for the engine as we try to
reset it. We already do try to flush the IIR following the reset, but we
have to ensure that the in-flight interrupt does not land after we start
cleaning up after the reset; enter synchronize_irq().
As it current stands, we very rarely, but fatally, see sequences such as:
2.... 57964564us : execlists_reset_prepare: rcs0
2.... 57964613us : execlists_reset: rcs0 seqno=424
0d.h1 57964615us : gen8_cs_irq_handler: rcs0 CS active=1
2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060
2.... 57964703us : execlists_reset_finish: rcs0
0..s. 57964705us : execlists_submission_tasklet: rcs0 awake?=1, active=0, irq-posted?=1
v2: Move the sync into the execlists reset handler so that we coordinate
the flush with disabling the interrupt handling and canceling the
pending interrupt.
v3: Just use synchronize_hardirq() to avoid the might_sleep(), we do not
yet have threaded-irq to worry about.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Jeff McGee <jeff.mcgee@intel.com>
---
drivers/gpu/drm/i915/intel_lrc.c | 7 ++++---
drivers/gpu/drm/i915/intel_uncore.c | 4 +++-
2 files changed, 7 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 67b6a0f658d6..ce09c5ad334f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -805,6 +805,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
spin_unlock(&engine->timeline->lock);
+ /* Mark all CS interrupts as complete */
+ smp_store_mb(execlists->active, 0);
+ synchronize_hardirq(engine->i915->drm.irq);
+
/*
* The port is checked prior to scheduling a tasklet, but
* just in case we have suspended the tasklet to do the
@@ -813,9 +817,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
*/
clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
- /* Mark all CS interrupts as complete */
- execlists->active = 0;
-
local_irq_restore(flags);
}
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 4c616d074a97..f37ecfc69e49 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -2116,8 +2116,10 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
i915_stop_engines(dev_priv, engine_mask);
ret = -ENODEV;
- if (reset)
+ if (reset) {
+ GEM_TRACE("engine_mask=%x\n", engine_mask);
ret = reset(dev_priv, engine_mask);
+ }
if (ret != -ETIMEDOUT)
break;
--
2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread
* ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (2 preceding siblings ...)
2018-03-22 7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
@ 2018-03-22 7:43 ` Patchwork
2018-03-22 7:49 ` [PATCH v2] " Chris Wilson
` (4 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 7:43 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
URL : https://patchwork.freedesktop.org/series/40439/
State : warning
== Summary ==
$ dim checkpatch origin/drm-tip
6adaf914c971 drm/i915/selftests: Include the trace as a debug aide
-:36: CHECK:SPACING: spaces preferred around that '*' (ctx:VxV)
#36: FILE: drivers/gpu/drm/i915/selftests/intel_hangcheck.c:627:
+ if (i915_request_wait(old, 0, 10*HZ) < 0) {
^
total: 0 errors, 0 warnings, 1 checks, 43 lines checked
aae1777597a8 drm/i915/selftests: Stress resets-vs-request-priority
10e13a64240a drm/i915: Use full serialisation around engine->irq_posted
a41114e3c858 drm/i915: Flush pending interrupt following a GPU reset
-:23: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#23:
2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060
total: 0 errors, 1 warnings, 0 checks, 30 lines checked
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (3 preceding siblings ...)
2018-03-22 7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
@ 2018-03-22 7:49 ` Chris Wilson
2018-03-22 14:26 ` Mika Kuoppala
2018-03-22 7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
` (3 subsequent siblings)
8 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 7:49 UTC (permalink / raw)
To: intel-gfx
If we fail to reset the GPU in a timely fashion, dump the GEM trace so
that we can see what operations were in flight when the GPU got stuck.
v2: There's more than one timeout that deserves tracing!
v3: Silence checkpatch by not even using a product at all!
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
1 file changed, 20 insertions(+), 3 deletions(-)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4372826998aa..9b235dae8dd9 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
{
struct wedge_me *w = container_of(work, typeof(*w), work.work);
- pr_err("%pS timed out, cancelling all further testing.\n",
- w->symbol);
+ pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
+
+ GEM_TRACE("%pS timed out.\n", w->symbol);
+ GEM_TRACE_DUMP();
+
i915_gem_set_wedged(w->i915);
}
@@ -621,9 +624,19 @@ static int active_engine(void *data)
mutex_unlock(&engine->i915->drm.struct_mutex);
if (old) {
- i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
+ if (i915_request_wait(old, 0, HZ) < 0) {
+ GEM_TRACE("%s timed out.\n", engine->name);
+ GEM_TRACE_DUMP();
+
+ i915_gem_set_wedged(engine->i915);
+ i915_request_put(old);
+ err = -EIO;
+ break;
+ }
i915_request_put(old);
}
+
+ cond_resched();
}
for (count = 0; count < ARRAY_SIZE(rq); count++)
@@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
err = i915_subtests(tests, i915);
+ mutex_lock(&i915->drm.struct_mutex);
+ flush_test(i915, I915_WAIT_LOCKED);
+ mutex_unlock(&i915->drm.struct_mutex);
+
i915_modparams.enable_hangcheck = saved_hangcheck;
intel_runtime_pm_put(i915);
--
2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply related [flat|nested] 19+ messages in thread
* ✓ Fi.CI.BAT: success for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (4 preceding siblings ...)
2018-03-22 7:49 ` [PATCH v2] " Chris Wilson
@ 2018-03-22 7:58 ` Patchwork
2018-03-22 8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
` (2 subsequent siblings)
8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 7:58 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
URL : https://patchwork.freedesktop.org/series/40439/
State : success
== Summary ==
Series 40439v1 series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide
https://patchwork.freedesktop.org/api/1.0/series/40439/revisions/1/mbox/
---- Known issues:
Test debugfs_test:
Subgroup read_all_entries:
incomplete -> PASS (fi-snb-2520m) fdo#103713
Test kms_flip:
Subgroup basic-flip-vs-wf_vblank:
fail -> PASS (fi-skl-6770hq) fdo#100368
Test kms_pipe_crc_basic:
Subgroup suspend-read-crc-pipe-b:
pass -> DMESG-FAIL (fi-cnl-y3) fdo#104951
Subgroup suspend-read-crc-pipe-c:
pass -> INCOMPLETE (fi-bxt-dsi) fdo#103927
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#104951 https://bugs.freedesktop.org/show_bug.cgi?id=104951
fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
fi-bdw-5557u total:285 pass:264 dwarn:0 dfail:0 fail:0 skip:21 time:430s
fi-bdw-gvtdvm total:285 pass:261 dwarn:0 dfail:0 fail:0 skip:24 time:442s
fi-blb-e6850 total:285 pass:220 dwarn:1 dfail:0 fail:0 skip:64 time:379s
fi-bsw-n3050 total:285 pass:239 dwarn:0 dfail:0 fail:0 skip:46 time:536s
fi-bwr-2160 total:285 pass:180 dwarn:0 dfail:0 fail:0 skip:105 time:297s
fi-bxt-dsi total:243 pass:216 dwarn:0 dfail:0 fail:0 skip:26
fi-bxt-j4205 total:285 pass:256 dwarn:0 dfail:0 fail:0 skip:29 time:511s
fi-byt-j1900 total:285 pass:250 dwarn:0 dfail:0 fail:0 skip:35 time:513s
fi-byt-n2820 total:285 pass:246 dwarn:0 dfail:0 fail:0 skip:39 time:503s
fi-cfl-8700k total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:416s
fi-cfl-s2 total:285 pass:259 dwarn:0 dfail:0 fail:0 skip:26 time:567s
fi-cnl-drrs total:285 pass:254 dwarn:3 dfail:0 fail:0 skip:28 time:527s
fi-cnl-y3 total:285 pass:258 dwarn:0 dfail:1 fail:0 skip:26 time:587s
fi-elk-e7500 total:285 pass:225 dwarn:1 dfail:0 fail:0 skip:59 time:427s
fi-gdg-551 total:285 pass:176 dwarn:0 dfail:0 fail:1 skip:108 time:320s
fi-glk-1 total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:531s
fi-hsw-4770 total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:404s
fi-ilk-650 total:285 pass:225 dwarn:0 dfail:0 fail:0 skip:60 time:419s
fi-ivb-3520m total:285 pass:256 dwarn:0 dfail:0 fail:0 skip:29 time:469s
fi-ivb-3770 total:285 pass:252 dwarn:0 dfail:0 fail:0 skip:33 time:434s
fi-kbl-7500u total:285 pass:260 dwarn:1 dfail:0 fail:0 skip:24 time:475s
fi-kbl-7567u total:285 pass:265 dwarn:0 dfail:0 fail:0 skip:20 time:464s
fi-kbl-r total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:513s
fi-pnv-d510 total:285 pass:219 dwarn:1 dfail:0 fail:0 skip:65 time:653s
fi-skl-6260u total:285 pass:265 dwarn:0 dfail:0 fail:0 skip:20 time:437s
fi-skl-6600u total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:535s
fi-skl-6700k2 total:285 pass:261 dwarn:0 dfail:0 fail:0 skip:24 time:504s
fi-skl-6770hq total:285 pass:265 dwarn:0 dfail:0 fail:0 skip:20 time:512s
fi-skl-guc total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:427s
fi-skl-gvtdvm total:285 pass:262 dwarn:0 dfail:0 fail:0 skip:23 time:444s
fi-snb-2520m total:285 pass:245 dwarn:0 dfail:0 fail:0 skip:40 time:563s
fi-snb-2600 total:285 pass:245 dwarn:0 dfail:0 fail:0 skip:40 time:403s
fi-cfl-u failed to connect after reboot
dff9ece60048108782aab6d6123822c1d34b0e5a drm-tip: 2018y-03m-21d-20h-44m-14s UTC integration manifest
a41114e3c858 drm/i915: Flush pending interrupt following a GPU reset
10e13a64240a drm/i915: Use full serialisation around engine->irq_posted
aae1777597a8 drm/i915/selftests: Stress resets-vs-request-priority
6adaf914c971 drm/i915/selftests: Include the trace as a debug aide
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8443/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (5 preceding siblings ...)
2018-03-22 7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
@ 2018-03-22 8:02 ` Patchwork
2018-03-22 8:19 ` ✓ Fi.CI.BAT: success " Patchwork
2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork
8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 8:02 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL : https://patchwork.freedesktop.org/series/40439/
State : warning
== Summary ==
$ dim checkpatch origin/drm-tip
d4b28d06e3b8 drm/i915/selftests: Include the trace as a debug aide
59301a5a5c45 drm/i915/selftests: Stress resets-vs-request-priority
4eb53ac79094 drm/i915: Use full serialisation around engine->irq_posted
cf4fae2b05ff drm/i915: Flush pending interrupt following a GPU reset
-:23: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#23:
2d..1 57964617us : __i915_request_unsubmit: rcs0 fence 29:1056 <- global_seqno 1060
total: 0 errors, 1 warnings, 0 checks, 30 lines checked
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* ✓ Fi.CI.BAT: success for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (6 preceding siblings ...)
2018-03-22 8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
@ 2018-03-22 8:19 ` Patchwork
2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork
8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 8:19 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL : https://patchwork.freedesktop.org/series/40439/
State : success
== Summary ==
Series 40439v2 series starting with [v2] drm/i915/selftests: Include the trace as a debug aide
https://patchwork.freedesktop.org/api/1.0/series/40439/revisions/2/mbox/
---- Known issues:
Test debugfs_test:
Subgroup read_all_entries:
incomplete -> PASS (fi-snb-2520m) fdo#103713 +1
Test gem_exec_suspend:
Subgroup basic-s3:
pass -> INCOMPLETE (fi-skl-6260u) fdo#104108
Test kms_flip:
Subgroup basic-flip-vs-wf_vblank:
fail -> PASS (fi-skl-6770hq) fdo#100368
fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
fdo#104108 https://bugs.freedesktop.org/show_bug.cgi?id=104108
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fi-bdw-5557u total:285 pass:264 dwarn:0 dfail:0 fail:0 skip:21 time:433s
fi-bdw-gvtdvm total:285 pass:261 dwarn:0 dfail:0 fail:0 skip:24 time:444s
fi-blb-e6850 total:285 pass:220 dwarn:1 dfail:0 fail:0 skip:64 time:380s
fi-bsw-n3050 total:285 pass:239 dwarn:0 dfail:0 fail:0 skip:46 time:535s
fi-bwr-2160 total:285 pass:180 dwarn:0 dfail:0 fail:0 skip:105 time:297s
fi-bxt-dsi total:285 pass:255 dwarn:0 dfail:0 fail:0 skip:30 time:512s
fi-bxt-j4205 total:285 pass:256 dwarn:0 dfail:0 fail:0 skip:29 time:513s
fi-byt-j1900 total:285 pass:250 dwarn:0 dfail:0 fail:0 skip:35 time:516s
fi-byt-n2820 total:285 pass:246 dwarn:0 dfail:0 fail:0 skip:39 time:503s
fi-cfl-8700k total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:409s
fi-cfl-s2 total:285 pass:259 dwarn:0 dfail:0 fail:0 skip:26 time:564s
fi-cnl-drrs total:285 pass:254 dwarn:3 dfail:0 fail:0 skip:28 time:535s
fi-cnl-y3 total:285 pass:259 dwarn:0 dfail:0 fail:0 skip:26 time:588s
fi-elk-e7500 total:285 pass:225 dwarn:1 dfail:0 fail:0 skip:59 time:429s
fi-gdg-551 total:285 pass:176 dwarn:0 dfail:0 fail:1 skip:108 time:320s
fi-glk-1 total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:536s
fi-hsw-4770 total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:401s
fi-ilk-650 total:285 pass:225 dwarn:0 dfail:0 fail:0 skip:60 time:425s
fi-ivb-3520m total:285 pass:256 dwarn:0 dfail:0 fail:0 skip:29 time:472s
fi-ivb-3770 total:285 pass:252 dwarn:0 dfail:0 fail:0 skip:33 time:434s
fi-kbl-7500u total:285 pass:260 dwarn:1 dfail:0 fail:0 skip:24 time:480s
fi-kbl-7567u total:285 pass:265 dwarn:0 dfail:0 fail:0 skip:20 time:466s
fi-kbl-r total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:513s
fi-pnv-d510 total:285 pass:219 dwarn:1 dfail:0 fail:0 skip:65 time:652s
fi-skl-6260u total:108 pass:104 dwarn:0 dfail:0 fail:0 skip:3
fi-skl-6600u total:285 pass:258 dwarn:0 dfail:0 fail:0 skip:27 time:529s
fi-skl-6700k2 total:285 pass:261 dwarn:0 dfail:0 fail:0 skip:24 time:495s
fi-skl-6770hq total:285 pass:265 dwarn:0 dfail:0 fail:0 skip:20 time:500s
fi-skl-guc total:285 pass:257 dwarn:0 dfail:0 fail:0 skip:28 time:438s
fi-skl-gvtdvm total:285 pass:262 dwarn:0 dfail:0 fail:0 skip:23 time:444s
fi-snb-2520m total:242 pass:208 dwarn:0 dfail:0 fail:0 skip:33
fi-snb-2600 total:285 pass:245 dwarn:0 dfail:0 fail:0 skip:40 time:398s
dff9ece60048108782aab6d6123822c1d34b0e5a drm-tip: 2018y-03m-21d-20h-44m-14s UTC integration manifest
cf4fae2b05ff drm/i915: Flush pending interrupt following a GPU reset
4eb53ac79094 drm/i915: Use full serialisation around engine->irq_posted
59301a5a5c45 drm/i915/selftests: Stress resets-vs-request-priority
d4b28d06e3b8 drm/i915/selftests: Include the trace as a debug aide
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8444/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* ✓ Fi.CI.IGT: success for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
` (7 preceding siblings ...)
2018-03-22 8:19 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2018-03-22 10:23 ` Patchwork
8 siblings, 0 replies; 19+ messages in thread
From: Patchwork @ 2018-03-22 10:23 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
== Series Details ==
Series: series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2)
URL : https://patchwork.freedesktop.org/series/40439/
State : success
== Summary ==
---- Known issues:
Test kms_cursor_legacy:
Subgroup flip-vs-cursor-atomic:
pass -> FAIL (shard-hsw) fdo#102670
Test kms_flip:
Subgroup flip-vs-wf_vblank-interruptible:
fail -> PASS (shard-hsw) fdo#100368 +1
Test kms_pipe_crc_basic:
Subgroup read-crc-pipe-c-frame-sequence:
pass -> FAIL (shard-apl) fdo#103481
Test kms_setmode:
Subgroup basic:
pass -> FAIL (shard-apl) fdo#99912
Test kms_vblank:
Subgroup pipe-a-ts-continuation-suspend:
pass -> INCOMPLETE (shard-hsw) fdo#103540
fdo#102670 https://bugs.freedesktop.org/show_bug.cgi?id=102670
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103481 https://bugs.freedesktop.org/show_bug.cgi?id=103481
fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
shard-apl total:3478 pass:1814 dwarn:1 dfail:0 fail:8 skip:1655 time:13097s
shard-hsw total:3391 pass:1726 dwarn:1 dfail:0 fail:3 skip:1659 time:11609s
shard-snb total:3478 pass:1357 dwarn:1 dfail:0 fail:3 skip:2117 time:7279s
Blacklisted hosts:
shard-kbl total:3454 pass:1924 dwarn:1 dfail:0 fail:9 skip:1519 time:9758s
== Logs ==
For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8444/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 7:49 ` [PATCH v2] " Chris Wilson
@ 2018-03-22 14:26 ` Mika Kuoppala
2018-03-22 14:30 ` Chris Wilson
0 siblings, 1 reply; 19+ messages in thread
From: Mika Kuoppala @ 2018-03-22 14:26 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Chris Wilson <chris@chris-wilson.co.uk> writes:
> If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> that we can see what operations were in flight when the GPU got stuck.
>
> v2: There's more than one timeout that deserves tracing!
> v3: Silence checkpatch by not even using a product at all!
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
> drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> 1 file changed, 20 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 4372826998aa..9b235dae8dd9 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> {
> struct wedge_me *w = container_of(work, typeof(*w), work.work);
>
> - pr_err("%pS timed out, cancelling all further testing.\n",
> - w->symbol);
> + pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> +
> + GEM_TRACE("%pS timed out.\n", w->symbol);
> + GEM_TRACE_DUMP();
> +
> i915_gem_set_wedged(w->i915);
> }
>
> @@ -621,9 +624,19 @@ static int active_engine(void *data)
> mutex_unlock(&engine->i915->drm.struct_mutex);
>
> if (old) {
> - i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> + if (i915_request_wait(old, 0, HZ) < 0) {
> + GEM_TRACE("%s timed out.\n", engine->name);
> + GEM_TRACE_DUMP();
> +
> + i915_gem_set_wedged(engine->i915);
> + i915_request_put(old);
> + err = -EIO;
> + break;
> + }
Using err = i915_request_wait() could have saved one extra request_put
but I dunno if it would be any cleaner.
> i915_request_put(old);
> }
> +
> + cond_resched();
To give more slack for other engines and main thread to proceed?
> }
>
> for (count = 0; count < ARRAY_SIZE(rq); count++)
> @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>
> err = i915_subtests(tests, i915);
>
> + mutex_lock(&i915->drm.struct_mutex);
> + flush_test(i915, I915_WAIT_LOCKED);
> + mutex_unlock(&i915->drm.struct_mutex);
> +
To wash out leftovers.
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> i915_modparams.enable_hangcheck = saved_hangcheck;
> intel_runtime_pm_put(i915);
>
> --
> 2.16.2
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 14:26 ` Mika Kuoppala
@ 2018-03-22 14:30 ` Chris Wilson
2018-03-22 19:29 ` Jeff McGee
0 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 14:30 UTC (permalink / raw)
To: Mika Kuoppala, intel-gfx
Quoting Mika Kuoppala (2018-03-22 14:26:41)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
> > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > that we can see what operations were in flight when the GPU got stuck.
> >
> > v2: There's more than one timeout that deserves tracing!
> > v3: Silence checkpatch by not even using a product at all!
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> > 1 file changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > index 4372826998aa..9b235dae8dd9 100644
> > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> > {
> > struct wedge_me *w = container_of(work, typeof(*w), work.work);
> >
> > - pr_err("%pS timed out, cancelling all further testing.\n",
> > - w->symbol);
> > + pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > +
> > + GEM_TRACE("%pS timed out.\n", w->symbol);
> > + GEM_TRACE_DUMP();
> > +
> > i915_gem_set_wedged(w->i915);
> > }
> >
> > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> > mutex_unlock(&engine->i915->drm.struct_mutex);
> >
> > if (old) {
> > - i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > + if (i915_request_wait(old, 0, HZ) < 0) {
> > + GEM_TRACE("%s timed out.\n", engine->name);
> > + GEM_TRACE_DUMP();
> > +
> > + i915_gem_set_wedged(engine->i915);
> > + i915_request_put(old);
> > + err = -EIO;
> > + break;
> > + }
>
> Using err = i915_request_wait() could have saved one extra request_put
> but I dunno if it would be any cleaner.
It's also -ETIME, which didn't fit my intention.
>
> > i915_request_put(old);
> > }
> > +
> > + cond_resched();
>
> To give more slack for other engines and main thread to proceed?
Yes. Otherwise, it spins mighty fine.
>
> > }
> >
> > for (count = 0; count < ARRAY_SIZE(rq); count++)
> > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> >
> > err = i915_subtests(tests, i915);
> >
> > + mutex_lock(&i915->drm.struct_mutex);
> > + flush_test(i915, I915_WAIT_LOCKED);
> > + mutex_unlock(&i915->drm.struct_mutex);
> > +
>
> To wash out leftovers.
Yeah, from the early abort we left requests unaccounted for and needed
to grab the struct_mutex to run retire. Otherwise we hit assertions
later on about trying to unload the driver with requests still inflight.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
@ 2018-03-22 14:35 ` Mika Kuoppala
2018-03-22 15:34 ` Jeff McGee
2018-03-30 23:08 ` Chris Wilson
2 siblings, 0 replies; 19+ messages in thread
From: Mika Kuoppala @ 2018-03-22 14:35 UTC (permalink / raw)
To: Chris Wilson, intel-gfx
Chris Wilson <chris@chris-wilson.co.uk> writes:
> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
>
> And at last Mika can be happy.
Yes.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> bool tasklet = false;
>
> if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> - if (READ_ONCE(engine->execlists.active)) {
> - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> - tasklet = true;
> - }
> + if (READ_ONCE(engine->execlists.active))
> + tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> + &engine->irq_posted);
> }
>
> if (iir & GT_RENDER_USER_INTERRUPT) {
> --
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
2018-03-22 14:35 ` Mika Kuoppala
@ 2018-03-22 15:34 ` Jeff McGee
2018-03-22 17:01 ` Chris Wilson
2018-03-30 23:08 ` Chris Wilson
2 siblings, 1 reply; 19+ messages in thread
From: Jeff McGee @ 2018-03-22 15:34 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
On Thu, Mar 22, 2018 at 07:35:32AM +0000, Chris Wilson wrote:
> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
>
> And at last Mika can be happy.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> bool tasklet = false;
>
> if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> - if (READ_ONCE(engine->execlists.active)) {
> - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> - tasklet = true;
> - }
> + if (READ_ONCE(engine->execlists.active))
> + tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> + &engine->irq_posted);
> }
>
> if (iir & GT_RENDER_USER_INTERRUPT) {
> --
> 2.16.2
>
Confirmed that this along with the interrupt flush eliminates the cases
of finding CSB tail at its reset value (0x7) in the tasklet in my force
preemption tests.
Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-22 15:34 ` Jeff McGee
@ 2018-03-22 17:01 ` Chris Wilson
0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 17:01 UTC (permalink / raw)
To: Jeff McGee; +Cc: intel-gfx, Mika
Quoting Jeff McGee (2018-03-22 15:34:45)
> On Thu, Mar 22, 2018 at 07:35:32AM +0000, Chris Wilson wrote:
> > Using engine->irq_posted for execlists, we are not always serialised by
> > the tasklet as we supposed. On the reset paths, the tasklet is disabled
> > and ignored. Instead, we manipulate the engine->irq_posted directly to
> > account for the reset, but if an interrupt fired before the reset and so
> > wrote to engine->irq_posted, that write may not be flushed from the
> > local CPU's cacheline until much later as the tasklet is already active
> > and so does not generate a mb(). To correctly serialise the interrupt
> > with reset, we need serialisation on the set_bit() itself.
> >
> > And at last Mika can be happy.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > CC: Michel Thierry <michel.thierry@intel.com>
> > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> > drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> > 1 file changed, 3 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index fa7310766217..27aee25429b7 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> > bool tasklet = false;
> >
> > if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> > - if (READ_ONCE(engine->execlists.active)) {
> > - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > - tasklet = true;
> > - }
> > + if (READ_ONCE(engine->execlists.active))
> > + tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> > + &engine->irq_posted);
> > }
> >
> > if (iir & GT_RENDER_USER_INTERRUPT) {
> > --
> > 2.16.2
> >
>
> Confirmed that this along with the interrupt flush eliminates the cases
> of finding CSB tail at its reset value (0x7) in the tasklet in my force
> preemption tests.
At the moment, I'm concerned about the failures we have in CI before we
go building on top. So care to complete the set of r-b for us to move
on?
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 14:30 ` Chris Wilson
@ 2018-03-22 19:29 ` Jeff McGee
2018-03-22 20:37 ` Chris Wilson
0 siblings, 1 reply; 19+ messages in thread
From: Jeff McGee @ 2018-03-22 19:29 UTC (permalink / raw)
To: Chris Wilson; +Cc: intel-gfx
On Thu, Mar 22, 2018 at 02:30:09PM +0000, Chris Wilson wrote:
> Quoting Mika Kuoppala (2018-03-22 14:26:41)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> >
> > > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > > that we can see what operations were in flight when the GPU got stuck.
> > >
> > > v2: There's more than one timeout that deserves tracing!
> > > v3: Silence checkpatch by not even using a product at all!
> > >
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > ---
> > > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> > > 1 file changed, 20 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > index 4372826998aa..9b235dae8dd9 100644
> > > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> > > {
> > > struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > >
> > > - pr_err("%pS timed out, cancelling all further testing.\n",
> > > - w->symbol);
> > > + pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > > +
> > > + GEM_TRACE("%pS timed out.\n", w->symbol);
> > > + GEM_TRACE_DUMP();
> > > +
> > > i915_gem_set_wedged(w->i915);
> > > }
> > >
> > > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> > > mutex_unlock(&engine->i915->drm.struct_mutex);
> > >
> > > if (old) {
> > > - i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > > + if (i915_request_wait(old, 0, HZ) < 0) {
> > > + GEM_TRACE("%s timed out.\n", engine->name);
> > > + GEM_TRACE_DUMP();
> > > +
> > > + i915_gem_set_wedged(engine->i915);
> > > + i915_request_put(old);
> > > + err = -EIO;
> > > + break;
> > > + }
> >
> > Using err = i915_request_wait() could have saved one extra request_put
> > but I dunno if it would be any cleaner.
>
> It's also -ETIME, which didn't fit my intention.
>
> >
> > > i915_request_put(old);
> > > }
> > > +
> > > + cond_resched();
> >
> > To give more slack for other engines and main thread to proceed?
>
> Yes. Otherwise, it spins mighty fine.
> >
> > > }
> > >
> > > for (count = 0; count < ARRAY_SIZE(rq); count++)
> > > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> > >
> > > err = i915_subtests(tests, i915);
> > >
> > > + mutex_lock(&i915->drm.struct_mutex);
> > > + flush_test(i915, I915_WAIT_LOCKED);
> > > + mutex_unlock(&i915->drm.struct_mutex);
> > > +
> >
> > To wash out leftovers.
>
> Yeah, from the early abort we left requests unaccounted for and needed
> to grab the struct_mutex to run retire. Otherwise we hit assertions
> later on about trying to unload the driver with requests still inflight.
> -Chris
On this and the 3 others in this series...
Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH v2] drm/i915/selftests: Include the trace as a debug aide
2018-03-22 19:29 ` Jeff McGee
@ 2018-03-22 20:37 ` Chris Wilson
0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-22 20:37 UTC (permalink / raw)
To: Jeff McGee; +Cc: intel-gfx
Quoting Jeff McGee (2018-03-22 19:29:16)
> On Thu, Mar 22, 2018 at 02:30:09PM +0000, Chris Wilson wrote:
> > Quoting Mika Kuoppala (2018-03-22 14:26:41)
> > > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > >
> > > > If we fail to reset the GPU in a timely fashion, dump the GEM trace so
> > > > that we can see what operations were in flight when the GPU got stuck.
> > > >
> > > > v2: There's more than one timeout that deserves tracing!
> > > > v3: Silence checkpatch by not even using a product at all!
> > > >
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > ---
> > > > drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 23 ++++++++++++++++++++---
> > > > 1 file changed, 20 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > index 4372826998aa..9b235dae8dd9 100644
> > > > --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> > > > @@ -260,8 +260,11 @@ static void wedge_me(struct work_struct *work)
> > > > {
> > > > struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > > >
> > > > - pr_err("%pS timed out, cancelling all further testing.\n",
> > > > - w->symbol);
> > > > + pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> > > > +
> > > > + GEM_TRACE("%pS timed out.\n", w->symbol);
> > > > + GEM_TRACE_DUMP();
> > > > +
> > > > i915_gem_set_wedged(w->i915);
> > > > }
> > > >
> > > > @@ -621,9 +624,19 @@ static int active_engine(void *data)
> > > > mutex_unlock(&engine->i915->drm.struct_mutex);
> > > >
> > > > if (old) {
> > > > - i915_request_wait(old, 0, MAX_SCHEDULE_TIMEOUT);
> > > > + if (i915_request_wait(old, 0, HZ) < 0) {
> > > > + GEM_TRACE("%s timed out.\n", engine->name);
> > > > + GEM_TRACE_DUMP();
> > > > +
> > > > + i915_gem_set_wedged(engine->i915);
> > > > + i915_request_put(old);
> > > > + err = -EIO;
> > > > + break;
> > > > + }
> > >
> > > Using err = i915_request_wait() could have saved one extra request_put
> > > but I dunno if it would be any cleaner.
> >
> > It's also -ETIME, which didn't fit my intention.
> >
> > >
> > > > i915_request_put(old);
> > > > }
> > > > +
> > > > + cond_resched();
> > >
> > > To give more slack for other engines and main thread to proceed?
> >
> > Yes. Otherwise, it spins mighty fine.
> > >
> > > > }
> > > >
> > > > for (count = 0; count < ARRAY_SIZE(rq); count++)
> > > > @@ -1126,6 +1139,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
> > > >
> > > > err = i915_subtests(tests, i915);
> > > >
> > > > + mutex_lock(&i915->drm.struct_mutex);
> > > > + flush_test(i915, I915_WAIT_LOCKED);
> > > > + mutex_unlock(&i915->drm.struct_mutex);
> > > > +
> > >
> > > To wash out leftovers.
> >
> > Yeah, from the early abort we left requests unaccounted for and needed
> > to grab the struct_mutex to run retire. Otherwise we hit assertions
> > later on about trying to unload the driver with requests still inflight.
> > -Chris
>
> On this and the 3 others in this series...
>
> Reviewed-by: Jeff McGee <jeff.mcgee@intel.com>
Much appreciated. Pushed, let's hope CI holds up and that we're ready to
start talking about the real changes required for forced preemption as
opposed to getting the existing code working.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
2018-03-22 14:35 ` Mika Kuoppala
2018-03-22 15:34 ` Jeff McGee
@ 2018-03-30 23:08 ` Chris Wilson
2018-03-31 8:59 ` Chris Wilson
2 siblings, 1 reply; 19+ messages in thread
From: Chris Wilson @ 2018-03-30 23:08 UTC (permalink / raw)
To: intel-gfx; +Cc: Mika
Quoting Chris Wilson (2018-03-22 07:35:32)
> Using engine->irq_posted for execlists, we are not always serialised by
> the tasklet as we supposed. On the reset paths, the tasklet is disabled
> and ignored. Instead, we manipulate the engine->irq_posted directly to
> account for the reset, but if an interrupt fired before the reset and so
> wrote to engine->irq_posted, that write may not be flushed from the
> local CPU's cacheline until much later as the tasklet is already active
> and so does not generate a mb(). To correctly serialise the interrupt
> with reset, we need serialisation on the set_bit() itself.
>
> And at last Mika can be happy.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> CC: Michel Thierry <michel.thierry@intel.com>
> Cc: Jeff McGee <jeff.mcgee@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
> drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> 1 file changed, 3 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index fa7310766217..27aee25429b7 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> bool tasklet = false;
>
> if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> - if (READ_ONCE(engine->execlists.active)) {
> - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> - tasklet = true;
> - }
> + if (READ_ONCE(engine->execlists.active))
> + tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> + &engine->irq_posted);
This is driving me mad. A very rare missed interrupt unless we
unconditionally kick tasklet:
if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
- if (READ_ONCE(engine->execlists.active))
- tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
- &engine->irq_posted);
+ if (READ_ONCE(engine->execlists.active)) {
+ set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
+ tasklet = true;
+ }
}
I can't see why.
Hmm, I wonder if we are seeing READ_ONCE(execlsts->active) false
negatives.
Getting close to admitting defeat :(
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted
2018-03-30 23:08 ` Chris Wilson
@ 2018-03-31 8:59 ` Chris Wilson
0 siblings, 0 replies; 19+ messages in thread
From: Chris Wilson @ 2018-03-31 8:59 UTC (permalink / raw)
To: intel-gfx
Quoting Chris Wilson (2018-03-31 00:08:47)
> Quoting Chris Wilson (2018-03-22 07:35:32)
> > Using engine->irq_posted for execlists, we are not always serialised by
> > the tasklet as we supposed. On the reset paths, the tasklet is disabled
> > and ignored. Instead, we manipulate the engine->irq_posted directly to
> > account for the reset, but if an interrupt fired before the reset and so
> > wrote to engine->irq_posted, that write may not be flushed from the
> > local CPU's cacheline until much later as the tasklet is already active
> > and so does not generate a mb(). To correctly serialise the interrupt
> > with reset, we need serialisation on the set_bit() itself.
> >
> > And at last Mika can be happy.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > CC: Michel Thierry <michel.thierry@intel.com>
> > Cc: Jeff McGee <jeff.mcgee@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> > drivers/gpu/drm/i915/i915_irq.c | 7 +++----
> > 1 file changed, 3 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index fa7310766217..27aee25429b7 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -1405,10 +1405,9 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
> > bool tasklet = false;
> >
> > if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> > - if (READ_ONCE(engine->execlists.active)) {
> > - __set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> > - tasklet = true;
> > - }
> > + if (READ_ONCE(engine->execlists.active))
> > + tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> > + &engine->irq_posted);
>
> This is driving me mad. A very rare missed interrupt unless we
> unconditionally kick tasklet:
>
> if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
> - if (READ_ONCE(engine->execlists.active))
> - tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
> - &engine->irq_posted);
> + if (READ_ONCE(engine->execlists.active)) {
> + set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> + tasklet = true;
> + }
> }
>
> I can't see why.
>
> Hmm, I wonder if we are seeing READ_ONCE(execlsts->active) false
> negatives.
Fortunately, doesn't appear to be that.
@@ -1405,9 +1405,10 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
bool tasklet = false;
if (iir & GT_CONTEXT_SWITCH_INTERRUPT) {
- if (READ_ONCE(engine->execlists.active))
- tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
- &engine->irq_posted);
+ GEM_BUG_ON(!READ_ONCE(execlists->tasklet.state) &&
+ test_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted));
+ tasklet = !test_and_set_bit(ENGINE_IRQ_EXECLIST,
+ &engine->irq_posted);
}
Hasn't even hit a BUG, which is a little disconcerting.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx
^ permalink raw reply [flat|nested] 19+ messages in thread
end of thread, other threads:[~2018-03-31 8:59 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-22 7:35 [PATCH 1/4] drm/i915/selftests: Include the trace as a debug aide Chris Wilson
2018-03-22 7:35 ` [PATCH 2/4] drm/i915/selftests: Stress resets-vs-request-priority Chris Wilson
2018-03-22 7:35 ` [PATCH 3/4] drm/i915: Use full serialisation around engine->irq_posted Chris Wilson
2018-03-22 14:35 ` Mika Kuoppala
2018-03-22 15:34 ` Jeff McGee
2018-03-22 17:01 ` Chris Wilson
2018-03-30 23:08 ` Chris Wilson
2018-03-31 8:59 ` Chris Wilson
2018-03-22 7:35 ` [PATCH 4/4] drm/i915: Flush pending interrupt following a GPU reset Chris Wilson
2018-03-22 7:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/4] drm/i915/selftests: Include the trace as a debug aide Patchwork
2018-03-22 7:49 ` [PATCH v2] " Chris Wilson
2018-03-22 14:26 ` Mika Kuoppala
2018-03-22 14:30 ` Chris Wilson
2018-03-22 19:29 ` Jeff McGee
2018-03-22 20:37 ` Chris Wilson
2018-03-22 7:58 ` ✓ Fi.CI.BAT: success for series starting with [1/4] " Patchwork
2018-03-22 8:02 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [v2] drm/i915/selftests: Include the trace as a debug aide (rev2) Patchwork
2018-03-22 8:19 ` ✓ Fi.CI.BAT: success " Patchwork
2018-03-22 10:23 ` ✓ Fi.CI.IGT: " Patchwork
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.