intel-gfx.lists.freedesktop.org archive mirror
* [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
@ 2020-02-10 20:57 Chris Wilson
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding Chris Wilson
                   ` (9 more replies)
  0 siblings, 10 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx; +Cc: Matthew Auld

We manipulate ring->head while the ring is active from inside
i915_request_retire(), under the protection of the timeline->mutex. We
therefore cannot rely on a stable ring->head outside of the
timeline->mutex, in particular while setting up the context for resume
and reset.
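
The rule being applied, as a minimal sketch (illustrative pseudo-code,
not the exact driver functions):

	/* retire side: ring->head is only advanced under timeline->mutex */
	mutex_lock(&tl->mutex);
	rq->ring->head = rq->postfix;	/* GPU has read past this request */
	mutex_unlock(&tl->mutex);

	/*
	 * reset/resume side: runs outside tl->mutex, so compute a local
	 * head and pass it down instead of touching ring->head at all.
	 */
	head = intel_ring_wrap(ce->ring, rq->head);
	__execlists_update_reg_state(ce, engine, head);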

Closes: https://gitlab.freedesktop.org/drm/intel/issues/1126
Fixes: 0881954965e3 ("drm/i915: Introduce intel_context.pin_mutex for pin management")
Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c    | 36 ++++++++++++--------------
 drivers/gpu/drm/i915/gt/selftest_lrc.c |  2 +-
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 929be03bbe7e..70d91ad923ef 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -235,7 +235,8 @@ static void execlists_init_reg_state(u32 *reg_state,
 				     bool close);
 static void
 __execlists_update_reg_state(const struct intel_context *ce,
-			     const struct intel_engine_cs *engine);
+			     const struct intel_engine_cs *engine,
+			     u32 head);
 
 static void mark_eio(struct i915_request *rq)
 {
@@ -1184,12 +1185,11 @@ static void reset_active(struct i915_request *rq,
 		head = rq->tail;
 	else
 		head = active_request(ce->timeline, rq)->head;
-	ce->ring->head = intel_ring_wrap(ce->ring, head);
-	intel_ring_update_space(ce->ring);
+	head = intel_ring_wrap(ce->ring, head);
 
 	/* Scrub the context image to prevent replaying the previous batch */
 	restore_default_state(ce, engine);
-	__execlists_update_reg_state(ce, engine);
+	__execlists_update_reg_state(ce, engine, head);
 
 	/* We've switched away, so this should be a no-op, but intent matters */
 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
@@ -2878,16 +2878,17 @@ static void execlists_context_unpin(struct intel_context *ce)
 
 static void
 __execlists_update_reg_state(const struct intel_context *ce,
-			     const struct intel_engine_cs *engine)
+			     const struct intel_engine_cs *engine,
+			     u32 head)
 {
 	struct intel_ring *ring = ce->ring;
 	u32 *regs = ce->lrc_reg_state;
 
-	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
+	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
 	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
 
 	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
-	regs[CTX_RING_HEAD] = ring->head;
+	regs[CTX_RING_HEAD] = head;
 	regs[CTX_RING_TAIL] = ring->tail;
 
 	/* RPCS */
@@ -2916,7 +2917,7 @@ __execlists_context_pin(struct intel_context *ce,
 
 	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
 	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
-	__execlists_update_reg_state(ce, engine);
+	__execlists_update_reg_state(ce, engine, ce->ring->tail);
 
 	return 0;
 }
@@ -2941,7 +2942,7 @@ static void execlists_context_reset(struct intel_context *ce)
 	/* Scrub away the garbage */
 	execlists_init_reg_state(ce->lrc_reg_state,
 				 ce, ce->engine, ce->ring, true);
-	__execlists_update_reg_state(ce, ce->engine);
+	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
 
 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
 }
@@ -3538,6 +3539,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 	struct intel_context *ce;
 	struct i915_request *rq;
+	u32 head;
 
 	mb(); /* paranoia: read the CSB pointers from after the reset */
 	clflush(execlists->csb_write);
@@ -3565,15 +3567,15 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 
 	if (i915_request_completed(rq)) {
 		/* Idle context; tidy up the ring so we can restart afresh */
-		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
+		head = intel_ring_wrap(ce->ring, rq->tail);
 		goto out_replay;
 	}
 
 	/* Context has requests still in-flight; it should not be idle! */
 	GEM_BUG_ON(i915_active_is_idle(&ce->active));
 	rq = active_request(ce->timeline, rq);
-	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
-	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
+	head = intel_ring_wrap(ce->ring, rq->head);
+	GEM_BUG_ON(head == ce->ring->tail);
 
 	/*
 	 * If this request hasn't started yet, e.g. it is waiting on a
@@ -3618,10 +3620,9 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
 
 out_replay:
 	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
-		     ce->ring->head, ce->ring->tail);
-	intel_ring_update_space(ce->ring);
+		     head, ce->ring->tail);
 	__execlists_reset_reg_state(ce, engine);
-	__execlists_update_reg_state(ce, engine);
+	__execlists_update_reg_state(ce, engine, head);
 	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
 
 unwind:
@@ -5265,10 +5266,7 @@ void intel_lr_context_reset(struct intel_engine_cs *engine,
 		restore_default_state(ce, engine);
 
 	/* Rerun the request; its payload has been neutered (if guilty). */
-	ce->ring->head = head;
-	intel_ring_update_space(ce->ring);
-
-	__execlists_update_reg_state(ce, engine);
+	__execlists_update_reg_state(ce, engine, head);
 }
 
 bool
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 7ef68500b2bd..82fa0712808e 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -201,7 +201,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio)
 		}
 		GEM_BUG_ON(!ce[1]->ring->size);
 		intel_ring_reset(ce[1]->ring, ce[1]->ring->size / 2);
-		__execlists_update_reg_state(ce[1], engine);
+		__execlists_update_reg_state(ce[1], engine, ce[1]->ring->head);
 
 		rq[0] = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK);
 		if (IS_ERR(rq[0])) {
-- 
2.25.0


* [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-11 14:50   ` Mika Kuoppala
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing Chris Wilson
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

Originally, I did not expect to have to rewind a context upon
timeslicing: the point was to replace the executing context with an
idle one! However, given a second context that depends on requests from
the first, we may have to split the requests of the first context in
order to execute the second, causing us to replay the first context and
so have to rewind its RING_TAIL.
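
To illustrate the rewind (a sketch of the submission state, not code):
context A queues rq1 and rq2, and context B's rq1 awaits A's rq1.

	ELSP before expiry:    A {rq1, rq2}      RING_TAIL = A:rq2->tail
	after timeslice split: A {rq1}, B {rq1}  RING_TAIL = A:rq1->tail
	                                         (rewound behind rq2!)

Resubmitting A with a RING_TAIL earlier than the one the hardware has
already seen is exactly the case this selftest exercises.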

References: 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 202 ++++++++++++++++++++++++-
 1 file changed, 201 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 82fa0712808e..8b7383f6d9b3 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -76,8 +76,11 @@ static int wait_for_submit(struct intel_engine_cs *engine,
 	do {
 		cond_resched();
 		intel_engine_flush_submission(engine);
-		if (i915_request_is_active(rq))
+		if (i915_request_is_active(rq) &&
+		    !READ_ONCE(engine->execlists.pending[0])) {
+			tasklet_unlock_wait(&engine->execlists.tasklet);
 			return 0;
+		}
 	} while (time_before(jiffies, timeout));
 
 	return -ETIME;
@@ -772,6 +775,202 @@ static int live_timeslice_preempt(void *arg)
 	return err;
 }
 
+static struct i915_request *
+create_rewinder(struct intel_context *ce,
+		struct i915_request *wait,
+		int slot)
+{
+	struct i915_request *rq;
+	u32 offset = i915_ggtt_offset(ce->engine->status_page.vma) + 4000;
+	u32 *cs;
+	int err;
+
+	rq = intel_context_create_request(ce);
+	if (IS_ERR(rq))
+		return rq;
+
+	if (wait) {
+		err = i915_request_await_dma_fence(rq, &wait->fence);
+		if (err)
+			goto err;
+	}
+
+	cs = intel_ring_begin(rq, 10);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+		goto err;
+	}
+
+	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+	*cs++ = MI_NOOP;
+
+	*cs++ = MI_SEMAPHORE_WAIT |
+		MI_SEMAPHORE_GLOBAL_GTT |
+		MI_SEMAPHORE_POLL |
+		MI_SEMAPHORE_SAD_NEQ_SDD;
+	*cs++ = 0;
+	*cs++ = offset;
+	*cs++ = 0;
+
+	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(rq->engine->mmio_base));
+	*cs++ = offset + slot * sizeof(u32);
+	*cs++ = 0;
+
+	intel_ring_advance(rq, cs);
+
+	rq->sched.attr.priority = I915_PRIORITY_MASK;
+	err = 0;
+err:
+	i915_request_get(rq);
+	i915_request_add(rq);
+	if (err) {
+		i915_request_put(rq);
+		return ERR_PTR(err);
+	}
+
+	return rq;
+}
+
+static int live_timeslice_rewind(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	/*
+	 * The usual presumption on timeslice expiration is that we replace
+	 * the active context with another. However, given a chain of
+	 * dependencies we may end up with replacing the context with itself,
+	 * but only a few of those requests, forcing us to rewind the
+	 * RING_TAIL of the original request.
+	 */
+	if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
+		return 0;
+
+	for_each_engine(engine, gt, id) {
+		struct i915_request *rq[3] = {};
+		struct intel_context *ce;
+		unsigned long heartbeat;
+		unsigned long timeslice;
+		int i, err = 0;
+		u32 *slot;
+
+		if (!intel_engine_has_timeslices(engine))
+			continue;
+
+		/*
+		 * A:rq1 -- semaphore wait, timestamp X
+		 * A:rq2 -- write timestamp Y
+		 *
+		 * B:rq1 [await A:rq1] -- write timestamp Z
+		 *
+		 * Force timeslice, release semaphore.
+		 *
+		 * Expect evaluation order XZY
+		 */
+
+		engine_heartbeat_disable(engine, &heartbeat);
+		timeslice = xchg(&engine->props.timeslice_duration_ms, 1);
+
+		slot = memset(engine->status_page.addr + 1000,
+			      0, 4 * sizeof(u32));
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto err;
+		}
+
+		rq[0] = create_rewinder(ce, NULL, 1);
+		if (IS_ERR(rq[0])) {
+			intel_context_put(ce);
+			goto err;
+		}
+
+		rq[1] = create_rewinder(ce, NULL, 2);
+		intel_context_put(ce);
+		if (IS_ERR(rq[1]))
+			goto err;
+
+		err = wait_for_submit(engine, rq[1], HZ / 2);
+		if (err) {
+			pr_err("%s: failed to submit first context\n",
+			       engine->name);
+			goto err;
+		}
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto err;
+		}
+
+		rq[2] = create_rewinder(ce, rq[0], 3);
+		intel_context_put(ce);
+		if (IS_ERR(rq[2]))
+			goto err;
+
+		err = wait_for_submit(engine, rq[2], HZ / 2);
+		if (err) {
+			pr_err("%s: failed to submit second context\n",
+			       engine->name);
+			goto err;
+		}
+		GEM_BUG_ON(!timer_pending(&engine->execlists.timer));
+
+		/* Wait for the timeslice to kick in */
+		del_timer(&engine->execlists.timer);
+		tasklet_hi_schedule(&engine->execlists.tasklet);
+		intel_engine_flush_submission(engine);
+
+		/* Release the hounds! */
+		slot[0] = 1;
+		wmb();
+
+		for (i = 1; i <= 3; i++) {
+			unsigned long timeout = jiffies + HZ / 2;
+
+			while (!READ_ONCE(slot[i]) &&
+			       time_before(jiffies, timeout))
+				;
+
+			if (!time_before(jiffies, timeout)) {
+				pr_err("%s: rq[%d] timed out\n",
+				       engine->name, i - 1);
+				err = -ETIME;
+				goto err;
+			}
+
+			pr_debug("%s: slot[%d]:%x\n", engine->name, i, slot[i]);
+		}
+
+		/* XZY: XZ < XY */
+		if (slot[3] - slot[1] >= slot[2] - slot[1]) {
+			pr_err("%s: timeslicing did not run context B [%u] before A [%u]!\n",
+			       engine->name,
+			       slot[3] - slot[1],
+			       slot[2] - slot[1]);
+			err = -EINVAL;
+		}
+
+err:
+		memset(slot, 0xff, 4 * sizeof(u32));
+		wmb();
+
+		engine->props.timeslice_duration_ms = timeslice;
+		engine_heartbeat_enable(engine, heartbeat);
+		for (i = 0; i < 3; i++)
+			i915_request_put(rq[i]);
+		if (igt_flush_test(gt->i915))
+			err = -EIO;
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static struct i915_request *nop_request(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq;
@@ -3619,6 +3818,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_hold_reset),
 		SUBTEST(live_error_interrupt),
 		SUBTEST(live_timeslice_preempt),
+		SUBTEST(live_timeslice_rewind),
 		SUBTEST(live_timeslice_queue),
 		SUBTEST(live_busywait_preempt),
 		SUBTEST(live_preempt),
-- 
2.25.0


* [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-11 15:23   ` Mika Kuoppala
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines Chris Wilson
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

We cannot require that the system processes a tasklet in reasonable
time (thanks be to ksoftirqd), but we can insist that, having waited
sufficiently for the error interrupt to have been raised and having
kicked the tasklet, the reset has begun and the request will be marked
as being in error (if not already completed).
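
In outline, the relaxed check becomes (a sketch of the selftest flow,
not the verbatim diff):

	if (i915_request_wait(rq, 0, HZ / 5) < 0)
		; /* may just be a slow ksoftirqd; not fatal by itself */

	if (!i915_request_started(rq))
		return -ETIME; /* the reset never began: a real failure */

	intel_engine_flush_submission(engine); /* kick the tasklet */
	/* only now insist that rq->fence.error matches the injected error */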

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index 8b7383f6d9b3..ccd4cd2c202d 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -503,14 +503,21 @@ static int live_error_interrupt(void *arg)
 			}
 
 			for (i = 0; i < ARRAY_SIZE(client); i++) {
-				if (i915_request_wait(client[i], 0, HZ / 5) < 0) {
-					pr_err("%s: %s request still executing!\n",
-					       engine->name,
-					       error_repr(p->error[i]));
+				if (i915_request_wait(client[i], 0, HZ / 5) < 0)
+					pr_debug("%s: %s request incomplete!\n",
+						 engine->name,
+						 error_repr(p->error[i]));
+
+				if (!i915_request_started(client[i])) {
+					pr_debug("%s: %s request not started!\n",
+						 engine->name,
+						 error_repr(p->error[i]));
 					err = -ETIME;
 					goto out;
 				}
 
+				/* Kick the tasklet to process the error */
+				intel_engine_flush_submission(engine);
 				if (client[i]->fence.error != p->error[i]) {
 					pr_err("%s: %s request completed with wrong error code: %d\n",
 					       engine->name,
-- 
2.25.0


* [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding Chris Wilson
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-11 13:41   ` Tvrtko Ursulin
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context Chris Wilson
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

If we have a set of active engines marked as being non-persistent, we
lose track of those if the user replaces those engines with
I915_CONTEXT_PARAM_ENGINES. Since our uABI contract is that
non-persistent requests are terminated if they are no longer being
tracked by the user's context (in order to prevent a lost request
causing an untracked and so unstoppable GPU hang), we need to apply the
same context cancellation upon changing engines.

v2: Track stale engines[] so we only reap at context closure.
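
In outline, the lifecycle of a replaced engine set (a sketch using the
names introduced by this patch):

	set_engines():
	    old = rcu_replace_pointer(ctx->engines, new, 1);
	    engines_idle_release(old);  /* park old set on ctx->stale and
	                                 * await each engine's last_request */

	all requests idle:
	    engines_notify(FENCE_COMPLETE) -> unlink from ctx->stale
	    engines_notify(FENCE_FREE)     -> call_rcu(free_engines_rcu)

	context closed first:
	    kill_context() -> kill_stale_engines() walks ctx->stale and
	    bans/resets any engine set with requests still in flight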

Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional")
Testcase: igt/gem_ctx_persistence/replace
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gem/i915_gem_context.c   | 118 ++++++++++++++++--
 .../gpu/drm/i915/gem/i915_gem_context_types.h |  13 +-
 drivers/gpu/drm/i915/i915_sw_fence.c          |  19 ++-
 drivers/gpu/drm/i915/i915_sw_fence.h          |   2 +-
 4 files changed, 139 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
index cfaf5bbdbcab..ba29462bd501 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
@@ -270,7 +270,8 @@ static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
 	if (!e)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&e->rcu);
+	e->ctx = ctx;
+
 	for_each_engine(engine, gt, id) {
 		struct intel_context *ce;
 
@@ -450,7 +451,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
 	return engine;
 }
 
-static void kill_context(struct i915_gem_context *ctx)
+static void kill_engines(struct i915_gem_engines *engines)
 {
 	struct i915_gem_engines_iter it;
 	struct intel_context *ce;
@@ -462,7 +463,7 @@ static void kill_context(struct i915_gem_context *ctx)
 	 * However, we only care about pending requests, so only include
 	 * engines on which there are incomplete requests.
 	 */
-	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
+	for_each_gem_engine(ce, engines, it) {
 		struct intel_engine_cs *engine;
 
 		if (intel_context_set_banned(ce))
@@ -484,10 +485,41 @@ static void kill_context(struct i915_gem_context *ctx)
 			 * the context from the GPU, we have to resort to a full
 			 * reset. We hope the collateral damage is worth it.
 			 */
-			__reset_context(ctx, engine);
+			__reset_context(engines->ctx, engine);
 	}
 }
 
+static void kill_stale_engines(struct i915_gem_context *ctx)
+{
+	struct i915_gem_engines *pos, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->stale.lock, flags);
+	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
+		if (!i915_sw_fence_await(&pos->fence))
+			continue;
+
+		spin_unlock_irqrestore(&ctx->stale.lock, flags);
+
+		kill_engines(pos);
+
+		spin_lock_irqsave(&ctx->stale.lock, flags);
+		list_safe_reset_next(pos, next, link);
+		list_del_init(&pos->link); /* decouple from FENCE_COMPLETE */
+
+		i915_sw_fence_complete(&pos->fence);
+	}
+	spin_unlock_irqrestore(&ctx->stale.lock, flags);
+}
+
+static void kill_context(struct i915_gem_context *ctx)
+{
+	if (!list_empty(&ctx->stale.engines))
+		kill_stale_engines(ctx);
+
+	kill_engines(__context_engines_static(ctx));
+}
+
 static void set_closed_name(struct i915_gem_context *ctx)
 {
 	char *s;
@@ -602,6 +634,9 @@ __create_context(struct drm_i915_private *i915)
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 	mutex_init(&ctx->mutex);
 
+	spin_lock_init(&ctx->stale.lock);
+	INIT_LIST_HEAD(&ctx->stale.engines);
+
 	mutex_init(&ctx->engines_mutex);
 	e = default_engines(ctx);
 	if (IS_ERR(e)) {
@@ -1529,6 +1564,71 @@ static const i915_user_extension_fn set_engines__extensions[] = {
 	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
 };
 
+static int engines_notify(struct i915_sw_fence *fence,
+			  enum i915_sw_fence_notify state)
+{
+	struct i915_gem_engines *engines =
+		container_of(fence, typeof(*engines), fence);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		if (!list_empty(&engines->link)) {
+			struct i915_gem_context *ctx = engines->ctx;
+			unsigned long flags;
+
+			spin_lock_irqsave(&ctx->stale.lock, flags);
+			list_del(&engines->link);
+			spin_unlock_irqrestore(&ctx->stale.lock, flags);
+		}
+		break;
+
+	case FENCE_FREE:
+		init_rcu_head(&engines->rcu);
+		call_rcu(&engines->rcu, free_engines_rcu);
+		break;
+	}
+
+	return NOTIFY_DONE;
+}
+
+static void engines_idle_release(struct i915_gem_engines *engines)
+{
+	struct i915_gem_engines_iter it;
+	struct intel_context *ce;
+	unsigned long flags;
+
+	GEM_BUG_ON(!engines);
+	i915_sw_fence_init(&engines->fence, engines_notify);
+
+	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
+	list_add(&engines->link, &engines->ctx->stale.engines);
+	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
+
+	for_each_gem_engine(ce, engines, it) {
+		struct dma_fence *fence;
+		int err;
+
+		if (!ce->timeline)
+			continue;
+
+		fence = i915_active_fence_get(&ce->timeline->last_request);
+		if (!fence)
+			continue;
+
+		err = i915_sw_fence_await_dma_fence(&engines->fence,
+						    fence, 0,
+						    GFP_KERNEL);
+
+		dma_fence_put(fence);
+		if (err < 0) {
+			kill_engines(engines);
+			break;
+		}
+	}
+
+	i915_sw_fence_commit(&engines->fence);
+}
+
 static int
 set_engines(struct i915_gem_context *ctx,
 	    const struct drm_i915_gem_context_param *args)
@@ -1571,7 +1671,8 @@ set_engines(struct i915_gem_context *ctx,
 	if (!set.engines)
 		return -ENOMEM;
 
-	init_rcu_head(&set.engines->rcu);
+	set.engines->ctx = ctx;
+
 	for (n = 0; n < num_engines; n++) {
 		struct i915_engine_class_instance ci;
 		struct intel_engine_cs *engine;
@@ -1631,7 +1732,8 @@ set_engines(struct i915_gem_context *ctx,
 	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
 	mutex_unlock(&ctx->engines_mutex);
 
-	call_rcu(&set.engines->rcu, free_engines_rcu);
+	/* Keep track of old engine sets for kill_context() */
+	engines_idle_release(set.engines);
 
 	return 0;
 }
@@ -1646,7 +1748,6 @@ __copy_engines(struct i915_gem_engines *e)
 	if (!copy)
 		return ERR_PTR(-ENOMEM);
 
-	init_rcu_head(&copy->rcu);
 	for (n = 0; n < e->num_engines; n++) {
 		if (e->engines[n])
 			copy->engines[n] = intel_context_get(e->engines[n]);
@@ -1890,7 +1991,8 @@ static int clone_engines(struct i915_gem_context *dst,
 	if (!clone)
 		goto err_unlock;
 
-	init_rcu_head(&clone->rcu);
+	clone->ctx = dst;
+
 	for (n = 0; n < e->num_engines; n++) {
 		struct intel_engine_cs *engine;
 
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
index 017ca803ab47..8d996dde8046 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
+++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
@@ -20,6 +20,7 @@
 #include "gt/intel_context_types.h"
 
 #include "i915_scheduler.h"
+#include "i915_sw_fence.h"
 
 struct pid;
 
@@ -30,7 +31,12 @@ struct intel_timeline;
 struct intel_ring;
 
 struct i915_gem_engines {
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		struct list_head link;
+	};
+	struct i915_sw_fence fence;
+	struct i915_gem_context *ctx;
 	unsigned int num_engines;
 	struct intel_context *engines[];
 };
@@ -173,6 +179,11 @@ struct i915_gem_context {
 	 * context in messages.
 	 */
 	char name[TASK_COMM_LEN + 8];
+
+	struct {
+		struct spinlock lock;
+		struct list_head engines;
+	} stale;
 };
 
 #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 51ba97daf2a0..bc6d4f8b78f0 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -211,10 +211,23 @@ void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-void i915_sw_fence_await(struct i915_sw_fence *fence)
+bool i915_sw_fence_await(struct i915_sw_fence *fence)
 {
-	debug_fence_assert(fence);
-	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
+	int old, new;
+
+	/*
+	 * It is only safe to add a new await to the fence while it has
+	 * not yet been signaled.
+	 */
+	new = atomic_read(&fence->pending);
+	do {
+		if (new < 1)
+			return false;
+
+		old = new++;
+	} while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);
+
+	return true;
 }
 
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 19e806ce43bc..30a863353ee6 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -91,7 +91,7 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
-void i915_sw_fence_await(struct i915_sw_fence *fence);
+bool i915_sw_fence_await(struct i915_sw_fence *fence);
 void i915_sw_fence_complete(struct i915_sw_fence *fence);
 
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
-- 
2.25.0


* [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (2 preceding siblings ...)
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-11 17:36   ` Mika Kuoppala
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 6/7] drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore Chris Wilson
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

Currently on execlists, we use a local hwsp for the kernel_context,
rather than the engine's HWSP, as this is the default for execlists.
However, seqno rollover requires allocating a new HWSP cacheline, and
may require pinning a new HWSP page in the GGTT. Pinning in the GGTT is
not allowed from within the kernel_context timeline, as doing so may
require re-entering the kernel_context in order to evict from the GGTT.
As we want to avoid requiring a new HWSP for the kernel_context, we can
use the engine's permanently pinned HWSP instead. However, to do so we
must prevent the use of semaphores reading the kernel_context's HWSP,
as semaphores do not support rollover onto the same cacheline.
Fortunately, the kernel_context is mostly isolated, so is unlikely to
benefit from semaphores.
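
For reference, a semaphore emitted against a request's HWSP polls a
fixed GGTT address (a sketch mirroring the MI_SEMAPHORE_WAIT layout
used elsewhere in this series):

	*cs++ = MI_SEMAPHORE_WAIT |
		MI_SEMAPHORE_GLOBAL_GTT |
		MI_SEMAPHORE_POLL |
		MI_SEMAPHORE_SAD_GTE_SDD;
	*cs++ = seqno;		/* value the poll compares against */
	*cs++ = hwsp_offset;	/* fixed address of the old cacheline */
	*cs++ = 0;

If the timeline rolled over onto the same cacheline, an old waiter
would compare its stale seqno against the new epoch's values; hence
semaphores must be avoided once the kernel_context shares the static
engine HWSP.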

Reported-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_lrc.c    | 14 ++++++++++++--
 drivers/gpu/drm/i915/gt/selftest_lrc.c | 12 +++++++++---
 drivers/gpu/drm/i915/i915_request.c    | 14 +++++++++-----
 3 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 70d91ad923ef..902d440ef07d 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2964,7 +2964,8 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
 {
 	u32 *cs;
 
-	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
+	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
+		return 0;
 
 	cs = intel_ring_begin(rq, 6);
 	if (IS_ERR(cs))
@@ -4616,8 +4617,17 @@ static int __execlists_context_alloc(struct intel_context *ce,
 
 	if (!ce->timeline) {
 		struct intel_timeline *tl;
+		struct i915_vma *hwsp;
+
+		/*
+		 * Use the static global HWSP for the kernel context, and
+		 * a dynamically allocated cacheline for everyone else.
+		 */
+		hwsp = NULL;
+		if (unlikely(intel_context_is_barrier(ce)))
+			hwsp = engine->status_page.vma;
 
-		tl = intel_timeline_create(engine->gt, NULL);
+		tl = intel_timeline_create(engine->gt, hwsp);
 		if (IS_ERR(tl)) {
 			ret = PTR_ERR(tl);
 			goto error_deref_obj;
diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
index ccd4cd2c202d..6f458f6d5523 100644
--- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
+++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
@@ -3494,15 +3494,21 @@ static int bond_virtual_engine(struct intel_gt *gt,
 	rq[0] = ERR_PTR(-ENOMEM);
 	for_each_engine(master, gt, id) {
 		struct i915_sw_fence fence = {};
+		struct intel_context *ce;
 
 		if (master->class == class)
 			continue;
 
+		ce = intel_context_create(master);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			goto out;
+		}
+
 		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
 
-		rq[0] = igt_spinner_create_request(&spin,
-						   master->kernel_context,
-						   MI_NOOP);
+		rq[0] = igt_spinner_create_request(&spin, ce, MI_NOOP);
+		intel_context_put(ce);
 		if (IS_ERR(rq[0])) {
 			err = PTR_ERR(rq[0]);
 			goto out;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 0ecc2cf64216..1adb8cf35f75 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -886,6 +886,12 @@ emit_semaphore_wait(struct i915_request *to,
 		    struct i915_request *from,
 		    gfp_t gfp)
 {
+	if (!intel_context_use_semaphores(to->context))
+		goto await_fence;
+
+	if (!rcu_access_pointer(from->hwsp_cacheline))
+		goto await_fence;
+
 	/* Just emit the first semaphore we see as request space is limited. */
 	if (already_busywaiting(to) & from->engine->mask)
 		goto await_fence;
@@ -931,12 +937,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
 						       I915_FENCE_GFP);
-	else if (intel_context_use_semaphores(to->context))
-		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	else
-		ret = i915_sw_fence_await_dma_fence(&to->submit,
-						    &from->fence, 0,
-						    I915_FENCE_GFP);
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	if (ret < 0)
 		return ret;
 
@@ -1035,6 +1037,8 @@ __i915_request_await_execution(struct i915_request *to,
 {
 	int err;
 
+	GEM_BUG_ON(intel_context_is_barrier(from->context));
+
 	/* Submit both requests at the same time */
 	err = __await_execution(to, from, hook, I915_FENCE_GFP);
 	if (err)
-- 
2.25.0


* [Intel-gfx] [PATCH 6/7] drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (3 preceding siblings ...)
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay Chris Wilson
                   ` (4 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

If we find ourselves waiting on a MI_SEMAPHORE_WAIT, either within the
user batch or in our own preamble, the engine raises a
GT_WAIT_ON_SEMAPHORE interrupt. We can unmask that interrupt and so
respond to a semaphore wait by yielding the timeslice, if we have
another context to yield to!

The only real complication is that the interrupt is only generated for
the start of the semaphore wait, and is asynchronous to our
process_csb() -- that is, we may not have registered the timeslice before
we see the interrupt. To ensure we don't miss a potential semaphore
blocking forward progress (e.g. selftests/live_timeslice_preempt) we mark
the interrupt and apply it to the next timeslice regardless of whether it
was active at the time.

v2: We use semaphores in preempt-to-busy, within the timeslicing
implementation itself! Ergo, when we do insert a preemption due to an
expired timeslice, the new context may start with the missed semaphore
flagged by the retired context and be yielded, ad infinitum. To avoid
this, read the context id at the time of the semaphore interrupt and
only yield if that context is still active.
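
The resulting flow, in outline:

	HW misses a MI_SEMAPHORE_WAIT
	  -> semaphore-wait interrupt (GT_WAIT_SEMAPHORE_INTERRUPT)
	  -> cs_irq_handler(): record the active CCID from
	     RING_EXECLIST_STATUS_HI in execlists.yield, cancel the
	     timeslice timer and kick the tasklet
	  -> execlists_dequeue(): timeslice_expired() now also fires if
	     the active request's CCID still matches execlists.yield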

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_cs.c    |  6 +++
 drivers/gpu/drm/i915/gt/intel_engine_types.h |  9 +++++
 drivers/gpu/drm/i915/gt/intel_gt_irq.c       | 13 ++++++-
 drivers/gpu/drm/i915/gt/intel_lrc.c          | 40 +++++++++++++++++---
 drivers/gpu/drm/i915/i915_reg.h              |  1 +
 5 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index f6f5e1ec48fc..89f201a5a219 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -1288,6 +1288,12 @@ static void intel_engine_print_registers(struct intel_engine_cs *engine,
 
 	if (engine->id == RENDER_CLASS && IS_GEN_RANGE(dev_priv, 4, 7))
 		drm_printf(m, "\tCCID: 0x%08x\n", ENGINE_READ(engine, CCID));
+	if (HAS_EXECLISTS(dev_priv)) {
+		drm_printf(m, "\tEL_STAT_HI: 0x%08x\n",
+			   ENGINE_READ(engine, RING_EXECLIST_STATUS_HI));
+		drm_printf(m, "\tEL_STAT_LO: 0x%08x\n",
+			   ENGINE_READ(engine, RING_EXECLIST_STATUS_LO));
+	}
 	drm_printf(m, "\tRING_START: 0x%08x\n",
 		   ENGINE_READ(engine, RING_START));
 	drm_printf(m, "\tRING_HEAD:  0x%08x\n",
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index b23366a81048..24cff658e6e5 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -156,6 +156,15 @@ struct intel_engine_execlists {
 	 */
 	struct i915_priolist default_priolist;
 
+	/**
+	 * @yield: CCID at the time of the last semaphore-wait interrupt.
+	 *
+	 * Instead of leaving a semaphore busy-spinning on an engine, we would
+	 * like to switch to another ready context, i.e. yielding the semaphore
+	 * timeslice.
+	 */
+	u32 yield;
+
 	/**
 	 * @error_interrupt: CS Master EIR
 	 *
diff --git a/drivers/gpu/drm/i915/gt/intel_gt_irq.c b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
index f0e7fd95165a..875bd0392ffc 100644
--- a/drivers/gpu/drm/i915/gt/intel_gt_irq.c
+++ b/drivers/gpu/drm/i915/gt/intel_gt_irq.c
@@ -39,6 +39,13 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 		}
 	}
 
+	if (iir & GT_WAIT_SEMAPHORE_INTERRUPT) {
+		WRITE_ONCE(engine->execlists.yield,
+			   ENGINE_READ_FW(engine, RING_EXECLIST_STATUS_HI));
+		if (del_timer(&engine->execlists.timer))
+			tasklet = true;
+	}
+
 	if (iir & GT_CONTEXT_SWITCH_INTERRUPT)
 		tasklet = true;
 
@@ -228,7 +235,8 @@ void gen11_gt_irq_postinstall(struct intel_gt *gt)
 	const u32 irqs =
 		GT_CS_MASTER_ERROR_INTERRUPT |
 		GT_RENDER_USER_INTERRUPT |
-		GT_CONTEXT_SWITCH_INTERRUPT;
+		GT_CONTEXT_SWITCH_INTERRUPT |
+		GT_WAIT_SEMAPHORE_INTERRUPT;
 	struct intel_uncore *uncore = gt->uncore;
 	const u32 dmask = irqs << 16 | irqs;
 	const u32 smask = irqs << 16;
@@ -366,7 +374,8 @@ void gen8_gt_irq_postinstall(struct intel_gt *gt)
 	const u32 irqs =
 		GT_CS_MASTER_ERROR_INTERRUPT |
 		GT_RENDER_USER_INTERRUPT |
-		GT_CONTEXT_SWITCH_INTERRUPT;
+		GT_CONTEXT_SWITCH_INTERRUPT |
+		GT_WAIT_SEMAPHORE_INTERRUPT;
 	const u32 gt_interrupts[] = {
 		irqs << GEN8_RCS_IRQ_SHIFT | irqs << GEN8_BCS_IRQ_SHIFT,
 		irqs << GEN8_VCS0_IRQ_SHIFT | irqs << GEN8_VCS1_IRQ_SHIFT,
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 902d440ef07d..696f0b6b223c 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1685,7 +1685,8 @@ static void defer_active(struct intel_engine_cs *engine)
 }
 
 static bool
-need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
+need_timeslice(const struct intel_engine_cs *engine,
+	       const struct i915_request *rq)
 {
 	int hint;
 
@@ -1701,6 +1702,31 @@ need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
 	return hint >= effective_prio(rq);
 }
 
+static bool
+timeslice_yield(const struct intel_engine_execlists *el,
+		const struct i915_request *rq)
+{
+	/*
+	 * Once bitten, forever smitten!
+	 *
+	 * If the active context ever busy-waited on a semaphore,
+	 * it will be treated as a hog until the end of its timeslice.
+	 * The HW only sends an interrupt on the first miss, and we
+	 * do not know if that semaphore has been signaled, or even if it
+	 * is now stuck on another semaphore. Play safe, yield if it
+	 * might be stuck -- it will be given a fresh timeslice in
+	 * the near future.
+	 */
+	return upper_32_bits(rq->context->lrc_desc) == READ_ONCE(el->yield);
+}
+
+static bool
+timeslice_expired(const struct intel_engine_execlists *el,
+		  const struct i915_request *rq)
+{
+	return timer_expired(&el->timer) || timeslice_yield(el, rq);
+}
+
 static int
 switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
 {
@@ -1716,8 +1742,7 @@ timeslice(const struct intel_engine_cs *engine)
 	return READ_ONCE(engine->props.timeslice_duration_ms);
 }
 
-static unsigned long
-active_timeslice(const struct intel_engine_cs *engine)
+static unsigned long active_timeslice(const struct intel_engine_cs *engine)
 {
 	const struct i915_request *rq = *engine->execlists.active;
 
@@ -1860,13 +1885,14 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 			last = NULL;
 		} else if (need_timeslice(engine, last) &&
-			   timer_expired(&engine->execlists.timer)) {
+			   timeslice_expired(execlists, last)) {
 			ENGINE_TRACE(engine,
-				     "expired last=%llx:%lld, prio=%d, hint=%d\n",
+				     "expired last=%llx:%lld, prio=%d, hint=%d, yield?=%s\n",
 				     last->fence.context,
 				     last->fence.seqno,
 				     last->sched.attr.priority,
-				     execlists->queue_priority_hint);
+				     execlists->queue_priority_hint,
+				     yesno(timeslice_yield(execlists, last)));
 
 			ring_set_paused(engine, 1);
 			defer_active(engine);
@@ -2126,6 +2152,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		}
 		clear_ports(port + 1, last_port - port);
 
+		WRITE_ONCE(execlists->yield, -1);
 		execlists_submit_ports(engine);
 		set_preempt_timeout(engine);
 	} else {
@@ -4337,6 +4364,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 	engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
 	engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
+	engine->irq_keep_mask |= GT_WAIT_SEMAPHORE_INTERRUPT << shift;
 }
 
 static void rcs_submission_override(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
index b09c1d6dc0aa..0f1fcc863f3d 100644
--- a/drivers/gpu/drm/i915/i915_reg.h
+++ b/drivers/gpu/drm/i915/i915_reg.h
@@ -3090,6 +3090,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define GT_BSD_CS_ERROR_INTERRUPT		(1 << 15)
 #define GT_BSD_USER_INTERRUPT			(1 << 12)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT_S1	(1 << 11) /* hsw+; rsvd on snb, ivb, vlv */
+#define GT_WAIT_SEMAPHORE_INTERRUPT		REG_BIT(11) /* bdw+ */
 #define GT_CONTEXT_SWITCH_INTERRUPT		(1 <<  8)
 #define GT_RENDER_L3_PARITY_ERROR_INTERRUPT	(1 <<  5) /* !snb */
 #define GT_RENDER_PIPECTL_NOTIFY_INTERRUPT	(1 <<  4)
-- 
2.25.0


* [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (4 preceding siblings ...)
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 6/7] drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore Chris Wilson
@ 2020-02-10 20:57 ` Chris Wilson
  2020-02-12  1:08   ` Daniele Ceraolo Spurio
  2020-02-10 22:48 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Patchwork
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-10 20:57 UTC (permalink / raw)
  To: intel-gfx

To prevent the context from proceeding past the end of the request as
we unwind, we embed a semaphore into the footer of each request. (If
the context were to skip past the end of the request as we perform the
preemption, the next time we reload the context its RING_HEAD would be
past the RING_TAIL, and instead of replaying the commands it would read
the rest of the uninitialised ringbuffer.)

However, this requires us to keep the ring paused at the end of the
request until we have a chance to process the preemption ack and remove
the semaphore. Our processing of acks is at the whim of ksoftirqd, and
so it is entirely possible that the GPU has to wait for the tasklet
before it can proceed with the next request.

It was suggested that we could also embed a MI_LOAD_REGISTER_MEM into
the footer to read the current RING_TAIL from the context, which would
allow us not only to avoid this round trip (and so release the context
as soon as we had submitted the preemption request to the ELSP), but
also to skip using ELSP for lite-restores entirely. That has the nice
benefit of dramatically reducing contention and the frequency of
interrupts when a client submits two or more execbufs in rapid
succession.

However, mmio access to RING_TAIL was defeatured in gen11 so we can only
employ this handy trick for gen8/gen9.
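
The per-request footer on gen8/gen9 then looks like, in outline:

	MI_USER_INTERRUPT
	MI_ARB_ON_OFF | MI_ARB_ENABLE
	MI_SEMAPHORE_WAIT		/* preempt-to-busy guard: spins
					 * while the ring is paused */
	MI_LOAD_REGISTER_MEM(RING_TAIL)	<- context image[CTX_RING_TAIL]

Once the guard semaphore releases, the CS reloads RING_TAIL from the
saved context image itself, so a lite-restore no longer needs an ELSP
write and the ring need not stay paused waiting on ksoftirqd.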

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/gt/intel_engine_types.h | 23 +++--
 drivers/gpu/drm/i915/gt/intel_lrc.c          | 99 +++++++++++++++++++-
 2 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
index 24cff658e6e5..ae8724915320 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
@@ -488,14 +488,15 @@ struct intel_engine_cs {
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
-#define I915_ENGINE_USING_CMD_PARSER BIT(0)
-#define I915_ENGINE_SUPPORTS_STATS   BIT(1)
-#define I915_ENGINE_HAS_PREEMPTION   BIT(2)
-#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
-#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
-#define I915_ENGINE_IS_VIRTUAL       BIT(5)
-#define I915_ENGINE_HAS_RELATIVE_MMIO BIT(6)
-#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
+#define I915_ENGINE_REQUIRES_CMD_PARSER		BIT(0)
+#define I915_ENGINE_USING_CMD_PARSER		BIT(1)
+#define I915_ENGINE_SUPPORTS_STATS		BIT(2)
+#define I915_ENGINE_HAS_PREEMPTION		BIT(3)
+#define I915_ENGINE_HAS_SEMAPHORES		BIT(4)
+#define I915_ENGINE_HAS_TAIL_LRM		BIT(5)
+#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET	BIT(6)
+#define I915_ENGINE_IS_VIRTUAL			BIT(7)
+#define I915_ENGINE_HAS_RELATIVE_MMIO		BIT(8)
 	unsigned int flags;
 
 	/*
@@ -592,6 +593,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine)
 	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
 }
 
+static inline bool
+intel_engine_has_tail_lrm(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_HAS_TAIL_LRM;
+}
+
 static inline bool
 intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine)
 {
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 696f0b6b223c..5939672781fb 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -1797,6 +1797,74 @@ static inline void clear_ports(struct i915_request **ports, int count)
 	memset_p((void **)ports, NULL, count);
 }
 
+static struct i915_request *
+skip_lite_restore(struct intel_engine_cs *const engine,
+		  struct i915_request *first,
+		  bool *submit)
+{
+	struct intel_engine_execlists *const execlists = &engine->execlists;
+	struct i915_request *last = first;
+	struct rb_node *rb;
+
+	if (!intel_engine_has_tail_lrm(engine))
+		return last;
+
+	GEM_BUG_ON(*submit);
+	while ((rb = rb_first_cached(&execlists->queue))) {
+		struct i915_priolist *p = to_priolist(rb);
+		struct i915_request *rq, *rn;
+		int i;
+
+		priolist_for_each_request_consume(rq, rn, p, i) {
+			if (!can_merge_rq(last, rq))
+				goto out;
+
+			if (__i915_request_submit(rq)) {
+				*submit = true;
+				last = rq;
+			}
+		}
+
+		rb_erase_cached(&p->node, &execlists->queue);
+		i915_priolist_free(p);
+	}
+out:
+	if (*submit) {
+		ring_set_paused(engine, 1);
+
+		/*
+		 * If we are quick and the current context hasn't yet completed
+		 * its request, we can just tell it to extend the RING_TAIL
+		 * onto the next without having to submit a new ELSP.
+		 */
+		if (!i915_request_completed(first)) {
+			struct i915_request **port;
+
+			ENGINE_TRACE(engine,
+				     "eliding lite-restore last=%llx:%lld->%lld, current %d\n",
+				     first->fence.context,
+				     first->fence.seqno,
+				     last->fence.seqno,
+				     hwsp_seqno(last));
+			GEM_BUG_ON(first->context != last->context);
+
+			execlists_update_context(last);
+			for (port = (struct i915_request **)execlists->active;
+			     *port != first;
+			     port++)
+				;
+			WRITE_ONCE(*port, i915_request_get(last));
+			i915_request_put(first);
+
+			*submit = false;
+		}
+
+		ring_set_paused(engine, 0);
+	}
+
+	return last;
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1934,6 +2002,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 				return;
 			}
+
+			last = skip_lite_restore(engine, last, &submit);
 		}
 	}
 
@@ -2155,10 +2225,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		WRITE_ONCE(execlists->yield, -1);
 		execlists_submit_ports(engine);
 		set_preempt_timeout(engine);
-	} else {
+	}
+
+	if (intel_engine_has_tail_lrm(engine) || !submit)
 skip_submit:
 		ring_set_paused(engine, 0);
-	}
 }
 
 static void
@@ -2325,7 +2396,8 @@ static void process_csb(struct intel_engine_cs *engine)
 
 			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
 
-			ring_set_paused(engine, 0);
+			if (!intel_engine_has_tail_lrm(engine))
+				ring_set_paused(engine, 0);
 
 			/* Point active to the new ELSP; prevent overwriting */
 			WRITE_ONCE(execlists->active, execlists->pending);
@@ -4123,15 +4195,28 @@ static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
 	return cs;
 }
 
+static u32 *emit_lrm_tail(struct i915_request *request, u32 *cs)
+{
+	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
+	*cs++ = i915_mmio_reg_offset(RING_TAIL(request->engine->mmio_base));
+	*cs++ = i915_ggtt_offset(request->context->state) +
+		LRC_STATE_PN * PAGE_SIZE +
+		CTX_RING_TAIL * sizeof(u32);
+	*cs++ = 0;
+
+	return cs;
+}
+
 static __always_inline u32*
-gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
-				 u32 *cs)
+gen8_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
 {
 	*cs++ = MI_USER_INTERRUPT;
 
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 	if (intel_engine_has_semaphores(request->engine))
 		cs = emit_preempt_busywait(request, cs);
+	if (intel_engine_has_tail_lrm(request->engine))
+		cs = emit_lrm_tail(request, cs);
 
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
@@ -4220,6 +4305,8 @@ static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
 static __always_inline u32*
 gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
 {
+	GEM_BUG_ON(intel_engine_has_tail_lrm(request->engine));
+
 	*cs++ = MI_USER_INTERRUPT;
 
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
@@ -4286,6 +4373,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
 		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
 		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
 			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
+		if (INTEL_GEN(engine->i915) < 11)
+			engine->flags |= I915_ENGINE_HAS_TAIL_LRM;
 	}
 
 	if (INTEL_GEN(engine->i915) >= 12)
-- 
2.25.0


* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (5 preceding siblings ...)
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay Chris Wilson
@ 2020-02-10 22:48 ` Patchwork
  2020-02-10 23:14 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
                   ` (2 subsequent siblings)
  9 siblings, 0 replies; 22+ messages in thread
From: Patchwork @ 2020-02-10 22:48 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
URL   : https://patchwork.freedesktop.org/series/73256/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
1f4fe3bfcb66 drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
-:15: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#15: 
References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")

-:15: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")'
#15: 
References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")

total: 1 errors, 1 warnings, 0 checks, 115 lines checked
5bf840c72399 drm/i915/selftests: Exercise timeslice rewinding
-:13: WARNING:COMMIT_LOG_LONG_LINE: Possible unwrapped commit description (prefer a maximum 75 chars per line)
#13: 
References: 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")

-:13: ERROR:GIT_COMMIT_ID: Please use git commit description style 'commit <12+ chars of sha1> ("<title line>")' - ie: 'commit 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")'
#13: 
References: 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")

-:189: WARNING:MEMORY_BARRIER: memory barrier without comment
#189: FILE: drivers/gpu/drm/i915/gt/selftest_lrc.c:929:
+		wmb();

-:219: WARNING:MEMORY_BARRIER: memory barrier without comment
#219: FILE: drivers/gpu/drm/i915/gt/selftest_lrc.c:959:
+		wmb();

total: 1 errors, 3 warnings, 0 checks, 221 lines checked
2351ed092201 drm/i915/selftests: Relax timeout for error-interrupt reset processing
6ad77fccd647 drm/i915/gem: Don't leak non-persistent requests on changing engines
-:249: WARNING:USE_SPINLOCK_T: struct spinlock should be spinlock_t
#249: FILE: drivers/gpu/drm/i915/gem/i915_gem_context_types.h:184:
+		struct spinlock lock;

total: 0 errors, 1 warnings, 0 checks, 246 lines checked
e015db52f116 drm/i915: Disable use of hwsp_cacheline for kernel_context
49bdac891f25 drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore
034350519d42 drm/i915/execlists: Remove preempt-to-busy roundtrip delay


* [Intel-gfx] ✗ Fi.CI.BAT: failure for series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (6 preceding siblings ...)
  2020-02-10 22:48 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Patchwork
@ 2020-02-10 23:14 ` Patchwork
  2020-02-11 11:49 ` [Intel-gfx] [PATCH 1/7] " Andi Shyti
  2020-02-11 11:58 ` Mika Kuoppala
  9 siblings, 0 replies; 22+ messages in thread
From: Patchwork @ 2020-02-10 23:14 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
URL   : https://patchwork.freedesktop.org/series/73256/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_7903 -> Patchwork_16510
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_16510 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_16510, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_16510:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live_execlists:
    - fi-kbl-x1275:       [PASS][1] -> [DMESG-FAIL][2]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-kbl-x1275/igt@i915_selftest@live_execlists.html
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-kbl-x1275/igt@i915_selftest@live_execlists.html
    - fi-kbl-8809g:       [PASS][3] -> [DMESG-FAIL][4]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-kbl-8809g/igt@i915_selftest@live_execlists.html
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-kbl-8809g/igt@i915_selftest@live_execlists.html

  
Known issues
------------

  Here are the changes found in Patchwork_16510 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_close_race@basic-threads:
    - fi-byt-n2820:       [PASS][5] -> [INCOMPLETE][6] ([i915#45])
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-byt-n2820/igt@gem_close_race@basic-threads.html
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-byt-n2820/igt@gem_close_race@basic-threads.html

  * igt@i915_selftest@live_hangcheck:
    - fi-apl-guc:         [PASS][7] -> [INCOMPLETE][8] ([fdo#103927])
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-apl-guc/igt@i915_selftest@live_hangcheck.html
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-apl-guc/igt@i915_selftest@live_hangcheck.html
    - fi-glk-dsi:         [PASS][9] -> [INCOMPLETE][10] ([i915#58] / [k.org#198133])
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-glk-dsi/igt@i915_selftest@live_hangcheck.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-glk-dsi/igt@i915_selftest@live_hangcheck.html

  * igt@kms_chamelium@dp-crc-fast:
    - fi-cml-u2:          [PASS][11] -> [FAIL][12] ([i915#262])
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-cml-u2/igt@kms_chamelium@dp-crc-fast.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-cml-u2/igt@kms_chamelium@dp-crc-fast.html

  
#### Possible fixes ####

  * igt@i915_selftest@live_blt:
    - fi-bsw-n3050:       [INCOMPLETE][13] ([i915#392]) -> [PASS][14]
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-bsw-n3050/igt@i915_selftest@live_blt.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-bsw-n3050/igt@i915_selftest@live_blt.html

  * igt@i915_selftest@live_gem_contexts:
    - fi-cfl-8700k:       [DMESG-FAIL][15] ([i915#623]) -> [PASS][16]
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-cfl-8700k/igt@i915_selftest@live_gem_contexts.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-cfl-8700k/igt@i915_selftest@live_gem_contexts.html

  * igt@i915_selftest@live_gtt:
    - fi-bdw-5557u:       [TIMEOUT][17] ([fdo#112271]) -> [PASS][18]
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-bdw-5557u/igt@i915_selftest@live_gtt.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-bdw-5557u/igt@i915_selftest@live_gtt.html

  
#### Warnings ####

  * igt@i915_selftest@live_gem_contexts:
    - fi-cfl-guc:         [INCOMPLETE][19] ([CI#80] / [fdo#106070] / [i915#424]) -> [INCOMPLETE][20] ([fdo#106070] / [i915#424])
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_7903/fi-cfl-guc/igt@i915_selftest@live_gem_contexts.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/fi-cfl-guc/igt@i915_selftest@live_gem_contexts.html

  
  [CI#80]: https://gitlab.freedesktop.org/gfx-ci/i915-infra/issues/80
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#106070]: https://bugs.freedesktop.org/show_bug.cgi?id=106070
  [fdo#112271]: https://bugs.freedesktop.org/show_bug.cgi?id=112271
  [i915#262]: https://gitlab.freedesktop.org/drm/intel/issues/262
  [i915#392]: https://gitlab.freedesktop.org/drm/intel/issues/392
  [i915#424]: https://gitlab.freedesktop.org/drm/intel/issues/424
  [i915#45]: https://gitlab.freedesktop.org/drm/intel/issues/45
  [i915#58]: https://gitlab.freedesktop.org/drm/intel/issues/58
  [i915#623]: https://gitlab.freedesktop.org/drm/intel/issues/623
  [k.org#198133]: https://bugzilla.kernel.org/show_bug.cgi?id=198133


Participating hosts (47 -> 44)
------------------------------

  Additional (4): fi-hsw-peppy fi-skl-lmem fi-gdg-551 fi-snb-2600 
  Missing    (7): fi-kbl-soraka fi-ilk-m540 fi-hsw-4200u fi-byt-j1900 fi-byt-squawks fi-byt-clapper fi-bdw-samus 


Build changes
-------------

  * CI: CI-20190529 -> None
  * Linux: CI_DRM_7903 -> Patchwork_16510

  CI-20190529: 20190529
  CI_DRM_7903: 47b768c475f4a11a48bc43e6228660f8b26a542b @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5433: 6a96c17f3a1b4e1f90b1a0b0ce42a7219875d1a4 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_16510: 034350519d42184b266cac609044206c197a3867 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

034350519d42 drm/i915/execlists: Remove preempt-to-busy roundtrip delay
49bdac891f25 drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore
e015db52f116 drm/i915: Disable use of hwsp_cacheline for kernel_context
6ad77fccd647 drm/i915/gem: Don't leak non-persistent requests on changing engines
2351ed092201 drm/i915/selftests: Relax timeout for error-interrupt reset processing
5bf840c72399 drm/i915/selftests: Exercise timeslice rewinding
1f4fe3bfcb66 drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_16510/index.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (7 preceding siblings ...)
  2020-02-10 23:14 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
@ 2020-02-11 11:49 ` Andi Shyti
  2020-02-11 11:58 ` Mika Kuoppala
  9 siblings, 0 replies; 22+ messages in thread
From: Andi Shyti @ 2020-02-11 11:49 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, Matthew Auld

Hi Chris,

> We manipulate ring->head while active in i915_request_retire underneath
> the timeline manipulation. We cannot rely on a stable ring->head outside
> of the timeline->mutex, in particular while setting up the context for
> resume and reset.
> 
> Closes: https://gitlab.freedesktop.org/drm/intel/issues/1126
> Fixes: 0881954965e3 ("drm/i915: Introduce intel_context.pin_mutex for pin management")
> Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
> References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>

looks OK to me:

Reviewed-by: Andi Shyti <andi.shyti@intel.com>

Andi
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex
  2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
                   ` (8 preceding siblings ...)
  2020-02-11 11:49 ` [Intel-gfx] [PATCH 1/7] " Andi Shyti
@ 2020-02-11 11:58 ` Mika Kuoppala
  9 siblings, 0 replies; 22+ messages in thread
From: Mika Kuoppala @ 2020-02-11 11:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Matthew Auld

Chris Wilson <chris@chris-wilson.co.uk> writes:

> We manipulate ring->head while active in i915_request_retire underneath
> the timeline manipulation. We cannot rely on a stable ring->head outside
> of the timeline->mutex, in particular while setting up the context for
> resume and reset.

This solves the immediate problem of ring->head sampling in execlist
submission.

Future work to consider is making the write to ring->head a WRITE_ONCE,
even though it is under the timeline lock, and then using READ_ONCE in the
other, non-lockable places. Or at least the READ_ONCE notation outside of
the lock.
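
Something like (sketch only, untested):

	/* writer, under the timeline->mutex */
	WRITE_ONCE(ring->head, intel_ring_wrap(ring, head));

	/* unlocked readers, e.g. the reset paths */
	head = READ_ONCE(ce->ring->head);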

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


>
> Closes: https://gitlab.freedesktop.org/drm/intel/issues/1126
> Fixes: 0881954965e3 ("drm/i915: Introduce intel_context.pin_mutex for pin management")
> Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
> References: f3c0efc9fe7a ("drm/i915/execlists: Leave resetting ring to intel_ring")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Matthew Auld <matthew.auld@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/gt/intel_lrc.c    | 36 ++++++++++++--------------
>  drivers/gpu/drm/i915/gt/selftest_lrc.c |  2 +-
>  2 files changed, 18 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 929be03bbe7e..70d91ad923ef 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -235,7 +235,8 @@ static void execlists_init_reg_state(u32 *reg_state,
>  				     bool close);
>  static void
>  __execlists_update_reg_state(const struct intel_context *ce,
> -			     const struct intel_engine_cs *engine);
> +			     const struct intel_engine_cs *engine,
> +			     u32 head);
>  
>  static void mark_eio(struct i915_request *rq)
>  {
> @@ -1184,12 +1185,11 @@ static void reset_active(struct i915_request *rq,
>  		head = rq->tail;
>  	else
>  		head = active_request(ce->timeline, rq)->head;
> -	ce->ring->head = intel_ring_wrap(ce->ring, head);
> -	intel_ring_update_space(ce->ring);
> +	head = intel_ring_wrap(ce->ring, head);
>  
>  	/* Scrub the context image to prevent replaying the previous batch */
>  	restore_default_state(ce, engine);
> -	__execlists_update_reg_state(ce, engine);
> +	__execlists_update_reg_state(ce, engine, head);
>  
>  	/* We've switched away, so this should be a no-op, but intent matters */
>  	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> @@ -2878,16 +2878,17 @@ static void execlists_context_unpin(struct intel_context *ce)
>  
>  static void
>  __execlists_update_reg_state(const struct intel_context *ce,
> -			     const struct intel_engine_cs *engine)
> +			     const struct intel_engine_cs *engine,
> +			     u32 head)
>  {
>  	struct intel_ring *ring = ce->ring;
>  	u32 *regs = ce->lrc_reg_state;
>  
> -	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->head));
> +	GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
>  	GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
>  
>  	regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
> -	regs[CTX_RING_HEAD] = ring->head;
> +	regs[CTX_RING_HEAD] = head;
>  	regs[CTX_RING_TAIL] = ring->tail;
>  
>  	/* RPCS */
> @@ -2916,7 +2917,7 @@ __execlists_context_pin(struct intel_context *ce,
>  
>  	ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
>  	ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
> -	__execlists_update_reg_state(ce, engine);
> +	__execlists_update_reg_state(ce, engine, ce->ring->tail);
>  
>  	return 0;
>  }
> @@ -2941,7 +2942,7 @@ static void execlists_context_reset(struct intel_context *ce)
>  	/* Scrub away the garbage */
>  	execlists_init_reg_state(ce->lrc_reg_state,
>  				 ce, ce->engine, ce->ring, true);
> -	__execlists_update_reg_state(ce, ce->engine);
> +	__execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
>  
>  	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
>  }
> @@ -3538,6 +3539,7 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
>  	struct intel_context *ce;
>  	struct i915_request *rq;
> +	u32 head;
>  
>  	mb(); /* paranoia: read the CSB pointers from after the reset */
>  	clflush(execlists->csb_write);
> @@ -3565,15 +3567,15 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  
>  	if (i915_request_completed(rq)) {
>  		/* Idle context; tidy up the ring so we can restart afresh */
> -		ce->ring->head = intel_ring_wrap(ce->ring, rq->tail);
> +		head = intel_ring_wrap(ce->ring, rq->tail);
>  		goto out_replay;
>  	}
>  
>  	/* Context has requests still in-flight; it should not be idle! */
>  	GEM_BUG_ON(i915_active_is_idle(&ce->active));
>  	rq = active_request(ce->timeline, rq);
> -	ce->ring->head = intel_ring_wrap(ce->ring, rq->head);
> -	GEM_BUG_ON(ce->ring->head == ce->ring->tail);
> +	head = intel_ring_wrap(ce->ring, rq->head);
> +	GEM_BUG_ON(head == ce->ring->tail);
>  
>  	/*
>  	 * If this request hasn't started yet, e.g. it is waiting on a
> @@ -3618,10 +3620,9 @@ static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  
>  out_replay:
>  	ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
> -		     ce->ring->head, ce->ring->tail);
> -	intel_ring_update_space(ce->ring);
> +		     head, ce->ring->tail);
>  	__execlists_reset_reg_state(ce, engine);
> -	__execlists_update_reg_state(ce, engine);
> +	__execlists_update_reg_state(ce, engine, head);
>  	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
>  
>  unwind:
> @@ -5265,10 +5266,7 @@ void intel_lr_context_reset(struct intel_engine_cs *engine,
>  		restore_default_state(ce, engine);
>  
>  	/* Rerun the request; its payload has been neutered (if guilty). */
> -	ce->ring->head = head;
> -	intel_ring_update_space(ce->ring);
> -
> -	__execlists_update_reg_state(ce, engine);
> +	__execlists_update_reg_state(ce, engine, head);
>  }
>  
>  bool
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 7ef68500b2bd..82fa0712808e 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -201,7 +201,7 @@ static int live_unlite_restore(struct intel_gt *gt, int prio)
>  		}
>  		GEM_BUG_ON(!ce[1]->ring->size);
>  		intel_ring_reset(ce[1]->ring, ce[1]->ring->size / 2);
> -		__execlists_update_reg_state(ce[1], engine);
> +		__execlists_update_reg_state(ce[1], engine, ce[1]->ring->head);
>  
>  		rq[0] = igt_spinner_create_request(&spin, ce[0], MI_ARB_CHECK);
>  		if (IS_ERR(rq[0])) {
> -- 
> 2.25.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines Chris Wilson
@ 2020-02-11 13:41   ` Tvrtko Ursulin
  2020-02-11 14:15     ` Chris Wilson
  0 siblings, 1 reply; 22+ messages in thread
From: Tvrtko Ursulin @ 2020-02-11 13:41 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 10/02/2020 20:57, Chris Wilson wrote:
> If we have a set of active engines marked as being non-persistent, we
> lose track of those if the user replaces those engines with
> I915_CONTEXT_PARAM_ENGINES. As part of our uABI contract is that
> non-persistent requests are terminated if they are no longer being
> tracked by the user's context (in order to prevent a lost request
> causing an untracked and so unstoppable GPU hang), we need to apply the
> same context cancellation upon changing engines.
> 
> v2: Track stale engines[] so we only reap at context closure.
> 
> Fixes: a0e047156cde ("drm/i915/gem: Make context persistence optional")
> Testcase: igt/gem_ctx_peristence/replace
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/gem/i915_gem_context.c   | 118 ++++++++++++++++--
>   .../gpu/drm/i915/gem/i915_gem_context_types.h |  13 +-
>   drivers/gpu/drm/i915/i915_sw_fence.c          |  19 ++-
>   drivers/gpu/drm/i915/i915_sw_fence.h          |   2 +-
>   4 files changed, 139 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context.c b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> index cfaf5bbdbcab..ba29462bd501 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context.c
> @@ -270,7 +270,8 @@ static struct i915_gem_engines *default_engines(struct i915_gem_context *ctx)
>   	if (!e)
>   		return ERR_PTR(-ENOMEM);
>   
> -	init_rcu_head(&e->rcu);
> +	e->ctx = ctx;
> +
>   	for_each_engine(engine, gt, id) {
>   		struct intel_context *ce;
>   
> @@ -450,7 +451,7 @@ static struct intel_engine_cs *active_engine(struct intel_context *ce)
>   	return engine;
>   }
>   
> -static void kill_context(struct i915_gem_context *ctx)
> +static void kill_engines(struct i915_gem_engines *engines)
>   {
>   	struct i915_gem_engines_iter it;
>   	struct intel_context *ce;
> @@ -462,7 +463,7 @@ static void kill_context(struct i915_gem_context *ctx)
>   	 * However, we only care about pending requests, so only include
>   	 * engines on which there are incomplete requests.
>   	 */
> -	for_each_gem_engine(ce, __context_engines_static(ctx), it) {
> +	for_each_gem_engine(ce, engines, it) {
>   		struct intel_engine_cs *engine;
>   
>   		if (intel_context_set_banned(ce))
> @@ -484,10 +485,41 @@ static void kill_context(struct i915_gem_context *ctx)
>   			 * the context from the GPU, we have to resort to a full
>   			 * reset. We hope the collateral damage is worth it.
>   			 */
> -			__reset_context(ctx, engine);
> +			__reset_context(engines->ctx, engine);
>   	}
>   }
>   
> +static void kill_stale_engines(struct i915_gem_context *ctx)
> +{
> +	struct i915_gem_engines *pos, *next;
> +	unsigned long flags;
> +
> +	spin_lock_irqsave(&ctx->stale.lock, flags);
> +	list_for_each_entry_safe(pos, next, &ctx->stale.engines, link) {
> +		if (!i915_sw_fence_await(&pos->fence))
> +			continue;
> +
> +		spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +
> +		kill_engines(pos);
> +
> +		spin_lock_irqsave(&ctx->stale.lock, flags);
> +		list_safe_reset_next(pos, next, link);
> +		list_del_init(&pos->link); /* decouple from FENCE_COMPLETE */
> +
> +		i915_sw_fence_complete(&pos->fence);
> +	}
> +	spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +}
> +
> +static void kill_context(struct i915_gem_context *ctx)
> +{
> +	if (!list_empty(&ctx->stale.engines))
> +		kill_stale_engines(ctx);

Let's see... set_engines can freely race with context_close. The former is 
adding entries to the list under the lock, the latter is here inspecting 
the list state unlocked. But then it proceeds to lock it, and all is good if 
false negatives are not an issue. But it looks like one could happen, and 
then it fails to clean up. All that is needed is for this thread to not see 
the addition to the list. And since this is not a hot path, how about you 
just always call kill_stale_engines?
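
i.e. (sketch):

static void kill_context(struct i915_gem_context *ctx)
{
	/* kill_stale_engines takes ctx->stale.lock, so no racy unlocked peek */
	kill_stale_engines(ctx);

	kill_engines(__context_engines_static(ctx));
}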

> +
> +	kill_engines(__context_engines_static(ctx));
> +}
> +
>   static void set_closed_name(struct i915_gem_context *ctx)
>   {
>   	char *s;
> @@ -602,6 +634,9 @@ __create_context(struct drm_i915_private *i915)
>   	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
>   	mutex_init(&ctx->mutex);
>   
> +	spin_lock_init(&ctx->stale.lock);
> +	INIT_LIST_HEAD(&ctx->stale.engines);
> +
>   	mutex_init(&ctx->engines_mutex);
>   	e = default_engines(ctx);
>   	if (IS_ERR(e)) {
> @@ -1529,6 +1564,71 @@ static const i915_user_extension_fn set_engines__extensions[] = {
>   	[I915_CONTEXT_ENGINES_EXT_BOND] = set_engines__bond,
>   };
>   
> +static int engines_notify(struct i915_sw_fence *fence,
> +			  enum i915_sw_fence_notify state)
> +{
> +	struct i915_gem_engines *engines =
> +		container_of(fence, typeof(*engines), fence);
> +
> +	switch (state) {
> +	case FENCE_COMPLETE:
> +		if (!list_empty(&engines->link)) {

This unlocked peek indeed looks okay, as you said in the previous thread. 
The engines at this point "belong" to (are only reachable from) the sw_fence 
for the purpose of touching this field, so it looks fine.

> +			struct i915_gem_context *ctx = engines->ctx;
> +			unsigned long flags;
> +
> +			spin_lock_irqsave(&ctx->stale.lock, flags);
> +			list_del(&engines->link);
> +			spin_unlock_irqrestore(&ctx->stale.lock, flags);
> +		}
> +		break;
> +
> +	case FENCE_FREE:
> +		init_rcu_head(&engines->rcu);
> +		call_rcu(&engines->rcu, free_engines_rcu);
> +		break;
> +	}
> +
> +	return NOTIFY_DONE;
> +}
> +
> +static void engines_idle_release(struct i915_gem_engines *engines)
> +{
> +	struct i915_gem_engines_iter it;
> +	struct intel_context *ce;
> +	unsigned long flags;
> +
> +	GEM_BUG_ON(!engines);
> +	i915_sw_fence_init(&engines->fence, engines_notify);
> +
> +	spin_lock_irqsave(&engines->ctx->stale.lock, flags);
> +	list_add(&engines->link, &engines->ctx->stale.engines);
> +	spin_unlock_irqrestore(&engines->ctx->stale.lock, flags);
> +
> +	for_each_gem_engine(ce, engines, it) {
> +		struct dma_fence *fence;
> +		int err;
> +
> +		if (!ce->timeline)
> +			continue;
> +
> +		fence = i915_active_fence_get(&ce->timeline->last_request);
> +		if (!fence)
> +			continue;
> +
> +		err = i915_sw_fence_await_dma_fence(&engines->fence,
> +						    fence, 0,
> +						    GFP_KERNEL);
> +
> +		dma_fence_put(fence);
> +		if (err < 0) {
> +			kill_engines(engines);
> +			break;
> +		}
> +	}
> +
> +	i915_sw_fence_commit(&engines->fence);
> +}
> +
>   static int
>   set_engines(struct i915_gem_context *ctx,
>   	    const struct drm_i915_gem_context_param *args)
> @@ -1571,7 +1671,8 @@ set_engines(struct i915_gem_context *ctx,
>   	if (!set.engines)
>   		return -ENOMEM;
>   
> -	init_rcu_head(&set.engines->rcu);
> +	set.engines->ctx = ctx;
> +
>   	for (n = 0; n < num_engines; n++) {
>   		struct i915_engine_class_instance ci;
>   		struct intel_engine_cs *engine;
> @@ -1631,7 +1732,8 @@ set_engines(struct i915_gem_context *ctx,
>   	set.engines = rcu_replace_pointer(ctx->engines, set.engines, 1);
>   	mutex_unlock(&ctx->engines_mutex);
>   
> -	call_rcu(&set.engines->rcu, free_engines_rcu);
> +	/* Keep track of old engine sets for kill_context() */
> +	engines_idle_release(set.engines);
>   
>   	return 0;
>   }
> @@ -1646,7 +1748,6 @@ __copy_engines(struct i915_gem_engines *e)
>   	if (!copy)
>   		return ERR_PTR(-ENOMEM);
>   
> -	init_rcu_head(&copy->rcu);
>   	for (n = 0; n < e->num_engines; n++) {
>   		if (e->engines[n])
>   			copy->engines[n] = intel_context_get(e->engines[n]);
> @@ -1890,7 +1991,8 @@ static int clone_engines(struct i915_gem_context *dst,
>   	if (!clone)
>   		goto err_unlock;
>   
> -	init_rcu_head(&clone->rcu);
> +	clone->ctx = dst;
> +
>   	for (n = 0; n < e->num_engines; n++) {
>   		struct intel_engine_cs *engine;
>   
> diff --git a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> index 017ca803ab47..8d996dde8046 100644
> --- a/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> +++ b/drivers/gpu/drm/i915/gem/i915_gem_context_types.h
> @@ -20,6 +20,7 @@
>   #include "gt/intel_context_types.h"
>   
>   #include "i915_scheduler.h"
> +#include "i915_sw_fence.h"
>   
>   struct pid;
>   
> @@ -30,7 +31,12 @@ struct intel_timeline;
>   struct intel_ring;
>   
>   struct i915_gem_engines {
> -	struct rcu_head rcu;
> +	union {
> +		struct rcu_head rcu;
> +		struct list_head link;
> +	};
> +	struct i915_sw_fence fence;
> +	struct i915_gem_context *ctx;
>   	unsigned int num_engines;
>   	struct intel_context *engines[];
>   };
> @@ -173,6 +179,11 @@ struct i915_gem_context {
>   	 * context in messages.
>   	 */
>   	char name[TASK_COMM_LEN + 8];
> +
> +	struct {
> +		struct spinlock lock;
> +		struct list_head engines;
> +	} stale;
>   };
>   
>   #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 51ba97daf2a0..bc6d4f8b78f0 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -211,10 +211,23 @@ void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -void i915_sw_fence_await(struct i915_sw_fence *fence)
> +bool i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
> -	debug_fence_assert(fence);
> -	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> +	int old, new;
> +
> +	/*
> +	 * It is only safe to add a new await to the fence while it has
> +	 * not yet been signaled.
> +	 */
> +	new = atomic_read(&fence->pending);
> +	do {
> +		if (new < 1)
> +			return false;
> +
> +		old = new++;
> +	} while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);

Simplify with atomic_try_cmpxchg?

I need a refresher on sw_fence->pending. (See your new comments and 
raise you the lack of old ones! ;)

-1 = completed
0 = ??
1 = new, not waited upon
2 = waited upon

?

> +
> +	return true;
>   }
>   
>   void __i915_sw_fence_init(struct i915_sw_fence *fence,
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 19e806ce43bc..30a863353ee6 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -91,7 +91,7 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> -void i915_sw_fence_await(struct i915_sw_fence *fence);
> +bool i915_sw_fence_await(struct i915_sw_fence *fence);
>   void i915_sw_fence_complete(struct i915_sw_fence *fence);
>   
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
> 

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines
  2020-02-11 13:41   ` Tvrtko Ursulin
@ 2020-02-11 14:15     ` Chris Wilson
  0 siblings, 0 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-11 14:15 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2020-02-11 13:41:22)
> 
> On 10/02/2020 20:57, Chris Wilson wrote:
> > +static void kill_context(struct i915_gem_context *ctx)
> > +{
> > +     if (!list_empty(&ctx->stale.engines))
> > +             kill_stale_engines(ctx);
> 
> Let's see... set_engines can freely race with context_close. The former is 
> adding entries to the list under the lock, the latter is here inspecting 
> the list state unlocked. But then it proceeds to lock it, and all is good if 
> false negatives are not an issue. But it looks like one could happen, and 
> then it fails to clean up. All that is needed is for this thread to not see 
> the addition to the list. And since this is not a hot path, how about you 
> just always call kill_stale_engines?

Hmm. I didn't consider the race between context close and set-engines.

We would also need to reject the late addition of engines to a closed
context under the spinlock.
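
Something like (sketch only, in engines_idle_release(), assuming the usual
i915_gem_context_is_closed() helper; the fence commit still happens as
before):

	struct i915_gem_context *ctx = engines->ctx;
	bool closed;

	spin_lock_irqsave(&ctx->stale.lock, flags);
	closed = i915_gem_context_is_closed(ctx);
	if (!closed)
		list_add(&engines->link, &ctx->stale.engines);
	spin_unlock_irqrestore(&ctx->stale.lock, flags);

	if (closed) /* raced with context_close, reap immediately */
		kill_engines(engines);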

Ta.

> >   #endif /* __I915_GEM_CONTEXT_TYPES_H__ */
> > diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> > index 51ba97daf2a0..bc6d4f8b78f0 100644
> > --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> > +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> > @@ -211,10 +211,23 @@ void i915_sw_fence_complete(struct i915_sw_fence *fence)
> >       __i915_sw_fence_complete(fence, NULL);
> >   }
> >   
> > -void i915_sw_fence_await(struct i915_sw_fence *fence)
> > +bool i915_sw_fence_await(struct i915_sw_fence *fence)
> >   {
> > -     debug_fence_assert(fence);
> > -     WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> > +     int old, new;
> > +
> > +     /*
> > +      * It is only safe to add a new await to the fence while it has
> > +      * not yet been signaled.
> > +      */
> > +     new = atomic_read(&fence->pending);
> > +     do {
> > +             if (new < 1)
> > +                     return false;
> > +
> > +             old = new++;
> > +     } while ((new = atomic_cmpxchg(&fence->pending, old, new)) != old);
> 
> Simplify with atomic_try_cmpxchg?

I was under the mistaken impression we didn't have atomic_try_cmpxchg.
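
Something like this, then (sketch, untested):

	bool i915_sw_fence_await(struct i915_sw_fence *fence)
	{
		int pending = atomic_read(&fence->pending);

		do {
			/* only legal to add a wait while not yet signaled */
			if (pending < 1)
				return false;
		} while (!atomic_try_cmpxchg(&fence->pending,
					     &pending, pending + 1));

		return true;
	}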

> I need a refresher on sw_fence->pending. (See your new comments and 
> raise you the lack of old ones! ;)
> 
> -1 = completed
> 0 = ??

-1 = completed (all listeners awoken)
0 = signaled
1+ = pending waits (and yes we always start with 1 pending wait on behalf
of the caller)

> 1 = new, not waited upon
> 2 = waited upon

Maybe we don't really need -1? That was originally to avoid passing
FENCE_COMPLETE, FENCE_FREE but since we have the state, we don't need to
encode it? That would lead to a few small simplifications.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding Chris Wilson
@ 2020-02-11 14:50   ` Mika Kuoppala
  2020-02-11 15:16     ` Chris Wilson
  0 siblings, 1 reply; 22+ messages in thread
From: Mika Kuoppala @ 2020-02-11 14:50 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Originally, I did not expect having to rewind a context upon
> timeslicing: the point was to replace the executing context with an idle

I think you meant 'non-executing' and it would fit better.

> one! However, given a second context that depends on requests from the
> first, we may have to split the requests along the first context to
> execute the second, causing us to replay the first context and have to
> rewind the RING_TAIL.
>
> References: 5ba32c7be81e ("drm/i915/execlists: Always force a context reload when rewinding RING_TAIL")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 202 ++++++++++++++++++++++++-
>  1 file changed, 201 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 82fa0712808e..8b7383f6d9b3 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -76,8 +76,11 @@ static int wait_for_submit(struct intel_engine_cs *engine,
>  	do {
>  		cond_resched();
>  		intel_engine_flush_submission(engine);
> -		if (i915_request_is_active(rq))
> +		if (i915_request_is_active(rq) &&
> +		    !READ_ONCE(engine->execlists.pending[0])) {
> +			tasklet_unlock_wait(&engine->execlists.tasklet);
>  			return 0;
> +		}
>  	} while (time_before(jiffies, timeout));
>  
>  	return -ETIME;
> @@ -772,6 +775,202 @@ static int live_timeslice_preempt(void *arg)
>  	return err;
>  }
>  
> +static struct i915_request *
> +create_rewinder(struct intel_context *ce,
> +		struct i915_request *wait,
> +		int slot)
> +{
> +	struct i915_request *rq;
> +	u32 offset = i915_ggtt_offset(ce->engine->status_page.vma) + 4000;
> +	u32 *cs;
> +	int err;
> +
> +	rq = intel_context_create_request(ce);
> +	if (IS_ERR(rq))
> +		return rq;
> +
> +	if (wait) {
> +		err = i915_request_await_dma_fence(rq, &wait->fence);
> +		if (err)
> +			goto err;
> +	}
> +
> +	cs = intel_ring_begin(rq, 10);
> +	if (IS_ERR(cs)) {
> +		err = PTR_ERR(cs);
> +		goto err;
> +	}
> +
> +	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> +	*cs++ = MI_NOOP;
> +
> +	*cs++ = MI_SEMAPHORE_WAIT |
> +		MI_SEMAPHORE_GLOBAL_GTT |
> +		MI_SEMAPHORE_POLL |
> +		MI_SEMAPHORE_SAD_NEQ_SDD;
> +	*cs++ = 0;
> +	*cs++ = offset;
> +	*cs++ = 0;
> +
> +	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
> +	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP(rq->engine->mmio_base));
> +	*cs++ = offset + slot * sizeof(u32);
> +	*cs++ = 0;
> +
> +	intel_ring_advance(rq, cs);
> +
> +	rq->sched.attr.priority = I915_PRIORITY_MASK;
> +	err = 0;
> +err:
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +	if (err) {
> +		i915_request_put(rq);
> +		return ERR_PTR(err);
> +	}
> +
> +	return rq;
> +}
> +
> +static int live_timeslice_rewind(void *arg)
> +{
> +	struct intel_gt *gt = arg;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	/*
> +	 * The usual presumption on timeslice expiration is that we replace
> +	 * the active context with another. However, given a chain of
> +	 * dependencies we may end up with replacing the context with itself,
> +	 * but only a few of those requests, forcing us to rewind the
> +	 * RING_TAIL of the original request.
> +	 */
> +	if (!IS_ACTIVE(CONFIG_DRM_I915_TIMESLICE_DURATION))
> +		return 0;

Looks like you could remove this check...
> +
> +	for_each_engine(engine, gt, id) {
> +		struct i915_request *rq[3] = {};
> +		struct intel_context *ce;
> +		unsigned long heartbeat;
> +		unsigned long timeslice;
> +		int i, err = 0;
> +		u32 *slot;
> +
> +		if (!intel_engine_has_timeslices(engine))
> +			continue;

and let this handle everything...shrug.

> +
> +		/*
> +		 * A:rq1 -- semaphore wait, timestamp X
> +		 * A:rq2 -- write timestamp Y
> +		 *
> +		 * B:rq1 [await A:rq1] -- write timestamp Z
> +		 *
> +		 * Force timeslice, release sempahore.

s/sempahore/semaphore.


The comment is very helpful for dissecting the test.
I would have liked to have had two contexts, named A and B,
for even better readability, but on the other hand
it makes the error handling messier :P

> +		 *
> +		 * Expect evaluation order XZY
> +		 */


> +
> +		engine_heartbeat_disable(engine, &heartbeat);
> +		timeslice = xchg(&engine->props.timeslice_duration_ms, 1);
> +
> +		slot = memset(engine->status_page.addr + 1000,
> +			      0, 4 * sizeof(u32));
> +

The offset into the hwsp could be a define, but I'm not insisting.

> +		ce = intel_context_create(engine);
> +		if (IS_ERR(ce)) {
> +			err = PTR_ERR(ce);
> +			goto err;
> +		}
> +
> +		rq[0] = create_rewinder(ce, NULL, 1);
> +		if (IS_ERR(rq[0])) {
> +			intel_context_put(ce);
> +			goto err;
> +		}
> +
> +		rq[1] = create_rewinder(ce, NULL, 2);
> +		intel_context_put(ce);
> +		if (IS_ERR(rq[1]))
> +			goto err;
> +
> +		err = wait_for_submit(engine, rq[1], HZ / 2);
> +		if (err) {
> +			pr_err("%s: failed to submit first context\n",
> +			       engine->name);
> +			goto err;
> +		}
> +
> +		ce = intel_context_create(engine);
> +		if (IS_ERR(ce)) {
> +			err = PTR_ERR(ce);
> +			goto err;
> +		}
> +
> +		rq[2] = create_rewinder(ce, rq[0], 3);
> +		intel_context_put(ce);
> +		if (IS_ERR(rq[2]))
> +			goto err;
> +
> +		err = wait_for_submit(engine, rq[2], HZ / 2);
> +		if (err) {
> +			pr_err("%s: failed to submit second context\n",
> +			       engine->name);
> +			goto err;
> +		}
> +		GEM_BUG_ON(!timer_pending(&engine->execlists.timer));
> +
> +		/* Wait for the timeslice to kick in */
> +		del_timer(&engine->execlists.timer);
> +		tasklet_hi_schedule(&engine->execlists.tasklet);
> +		intel_engine_flush_submission(engine);
> +
> +		/* Release the hounds! */
> +		slot[0] = 1;
> +		wmb();
> +
> +		for (i = 1; i <= 3; i++) {
> +			unsigned long timeout = jiffies + HZ / 2;
> +
> +			while (!READ_ONCE(slot[i]) &&
> +			       time_before(jiffies, timeout))

You pushed with wmb() so you could pair the reads with rmb() and cpu_relax();
I guess it works fine without :O.
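
i.e. something like:

		while (!READ_ONCE(slot[i]) &&
		       time_before(jiffies, timeout)) {
			rmb(); /* pair with the GPU write to the HWSP */
			cpu_relax();
		}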

> +				;
> +
> +			if (!time_before(jiffies, timeout)) {
> +				pr_err("%s: rq[%d] timed out\n",
> +				       engine->name, i - 1);
> +				err = -ETIME;
> +				goto err;
> +			}
> +
> +			pr_debug("%s: slot[%d]:%x\n", engine->name, i, slot[i]);
> +		}
> +
> +		/* XZY: XZ < XY */
> +		if (slot[3] - slot[1] >= slot[2] - slot[1]) {
> +			pr_err("%s: timeslicing did not run context B [%u] before A [%u]!\n",
> +			       engine->name,
> +			       slot[3] - slot[1],
> +			       slot[2] - slot[1]);
> +			err = -EINVAL;
> +		}
> +
> +err:
> +		memset(slot, 0xff, 4 * sizeof(u32));

was expecting slot[0] = 

Only minor nitpicks/suggestions/typos.

In general the commenting pattern was enjoyable and helped:
the first comment describes what we try to achieve, then how, and
then the three one-liners point to the key switches/stages in it.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> +		wmb();
> +
> +		engine->props.timeslice_duration_ms = timeslice;
> +		engine_heartbeat_enable(engine, heartbeat);
> +		for (i = 0; i < 3; i++)
> +			i915_request_put(rq[i]);
> +		if (igt_flush_test(gt->i915))
> +			err = -EIO;
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
> +}
> +
>  static struct i915_request *nop_request(struct intel_engine_cs *engine)
>  {
>  	struct i915_request *rq;
> @@ -3619,6 +3818,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>  		SUBTEST(live_hold_reset),
>  		SUBTEST(live_error_interrupt),
>  		SUBTEST(live_timeslice_preempt),
> +		SUBTEST(live_timeslice_rewind),
>  		SUBTEST(live_timeslice_queue),
>  		SUBTEST(live_busywait_preempt),
>  		SUBTEST(live_preempt),
> -- 
> 2.25.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding
  2020-02-11 14:50   ` Mika Kuoppala
@ 2020-02-11 15:16     ` Chris Wilson
  0 siblings, 0 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-11 15:16 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-02-11 14:50:08)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > +             /* Release the hounds! */
> > +             slot[0] = 1;
> > +             wmb();
> > +
> > +             for (i = 1; i <= 3; i++) {
> > +                     unsigned long timeout = jiffies + HZ / 2;
> > +
> > +                     while (!READ_ONCE(slot[i]) &&
> > +                            time_before(jiffies, timeout))
> 
> You pushed with wmb() so you could pair the reads with rmb() and cpu_relax();
> I guess it works fine without :O.

The wmb() "pairs" with the GPU; just paranoia.

> > +                             ;
> > +
> > +                     if (!time_before(jiffies, timeout)) {
> > +                             pr_err("%s: rq[%d] timed out\n",
> > +                                    engine->name, i - 1);
> > +                             err = -ETIME;
> > +                             goto err;
> > +                     }
> > +
> > +                     pr_debug("%s: slot[%d]:%x\n", engine->name, i, slot[i]);
> > +             }
> > +
> > +             /* XZY: XZ < XY */
> > +             if (slot[3] - slot[1] >= slot[2] - slot[1]) {
> > +                     pr_err("%s: timeslicing did not run context B [%u] before A [%u]!\n",
> > +                            engine->name,
> > +                            slot[3] - slot[1],
> > +                            slot[2] - slot[1]);
> > +                     err = -EINVAL;
> > +             }
> > +
> > +err:
> > +             memset(slot, 0xff, 4 * sizeof(u32));
> 
> was expecting slot[0] = 
memset32(&slot[0], -1, 4); /* weirdo */
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing Chris Wilson
@ 2020-02-11 15:23   ` Mika Kuoppala
  2020-02-11 15:33     ` Chris Wilson
  0 siblings, 1 reply; 22+ messages in thread
From: Mika Kuoppala @ 2020-02-11 15:23 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> We cannot require that the system process a tasklet in a reasonable time
> (thanks be to ksoftirqd), but we can insist that having waited
> sufficiently for the error interrupt to have been raised and having
> kicked the tasklet, the reset has begun and the request will be marked
> as in error (if not already completed).
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 15 +++++++++++----
>  1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 8b7383f6d9b3..ccd4cd2c202d 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -503,14 +503,21 @@ static int live_error_interrupt(void *arg)
>  			}
>  
>  			for (i = 0; i < ARRAY_SIZE(client); i++) {
> -				if (i915_request_wait(client[i], 0, HZ / 5) < 0) {
> -					pr_err("%s: %s request still executing!\n",
> -					       engine->name,
> -					       error_repr(p->error[i]));
> +				if (i915_request_wait(client[i], 0, HZ / 5) < 0)
> +					pr_debug("%s: %s request incomplete!\n",
> +						 engine->name,
> +						 error_repr(p->error[i]));
> +
> +				if (!i915_request_started(client[i])) {
> +					pr_debug("%s: %s request not started!\n",
> +						 engine->name,
> +						 error_repr(p->error[i]));
>  					err = -ETIME;
>  					goto out;
>  				}
>  
> +				/* Kick the tasklet to process the error */
> +				intel_engine_flush_submission(engine);

This pattern of usage in selftests makes me want to add mb(); to
intel_engine_flush_submission(), as it does not seem to do it
implicitly.

We want to flush submission and observe the effects, thus
it seems like a good place.
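
Something like (sketch only, eliding the existing body of the helper):

	void intel_engine_flush_submission(struct intel_engine_cs *engine)
	{
		/* ... the existing flush/wait for the tasklet ... */

		smp_mb(); /* observe the tasklet's effects before sampling state */
	}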

Not in a scope of this patch tho,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

-Mika

>  				if (client[i]->fence.error != p->error[i]) {
>  					pr_err("%s: %s request completed with wrong error code: %d\n",
>  					       engine->name,
> -- 
> 2.25.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing
  2020-02-11 15:23   ` Mika Kuoppala
@ 2020-02-11 15:33     ` Chris Wilson
  2020-02-11 15:54       ` Mika Kuoppala
  0 siblings, 1 reply; 22+ messages in thread
From: Chris Wilson @ 2020-02-11 15:33 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-02-11 15:23:24)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > +                             /* Kick the tasklet to process the error */
> > +                             intel_engine_flush_submission(engine);
> 
> This pattern of usage in selftests makes me want to add mb(); to
> intel_engine_flush_submission(), as it does not seem to do it
> implicitly.
> 
> We want to flush submission and observe the effects, thus
> it seems like a good place.

Hmm. From the cpu perspective the memory barrier would be on the other
side in clear_bit_unlock(), and flush_submission does unlock_wait.

But, then, we have to factor in that we have to communicate with an
external client, the GPU, so perhaps an explicit memory barrier...

We certainly do perform the memory barrier in order to set the GPU in
motion, but have not relied on them for observing effects (aside from
the CSB ringbuf).

I don't see a strong argument that adding a 'mb/rmb' here will make any
difference, I don't see what we are pairing with from the GPU
perspective. But maybe there is?
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing
  2020-02-11 15:33     ` Chris Wilson
@ 2020-02-11 15:54       ` Mika Kuoppala
  2020-02-11 16:00         ` Chris Wilson
  0 siblings, 1 reply; 22+ messages in thread
From: Mika Kuoppala @ 2020-02-11 15:54 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2020-02-11 15:23:24)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> > +                             /* Kick the tasklet to process the error */
>> > +                             intel_engine_flush_submission(engine);
>> 
>> This pattern of usage in selftests makes me want to add mb(); to
>> intel_engine_flush_submission(), as it does not seem to do it
>> implicitly.
>> 
>> We want to flush submission and observe the effects, thus
>> it seems like a good place.
>
> Hmm. From the cpu perspective the memory barrier would be on the other
> side in clear_bit_unlock(), and flush_submission does unlock_wait.
>
> But, then, we have to factor in that we have to communicate with an
> external client, the GPU, so perhaps an explicit memory barrier...
>
> We certainly do perform the memory barrier in order to set the GPU in
> motion, but have not relied on them for observing effects (aside from
> the CSB ringbuf).
>
> I don't see a strong argument that adding a 'mb/rmb' here will make any
> difference, I don't see what we are pairing with from the GPU
> perspective. But maybe there is?

I don't have a strong argument from the gpu side. But what if the
flush only does the non-atomic wait and returns, and our
CPU has read the state up front for the next comparison?

Or, now thinking about it, the saving grace is that if we don't need
to flush in here, the tasklet has finished and the finish has
the barrier?

-Mika





_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing
  2020-02-11 15:54       ` Mika Kuoppala
@ 2020-02-11 16:00         ` Chris Wilson
  0 siblings, 0 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-11 16:00 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2020-02-11 15:54:07)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2020-02-11 15:23:24)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> > +                             /* Kick the tasklet to process the error */
> >> > +                             intel_engine_flush_submission(engine);
> >> 
> >> This pattern of usage in selftests makes me want to add mb(); to
> >> intel_engine_flush_submission(), as it does not seem to do it
> >> implicitly.
> >> 
> >> We want to flush submission and observe the effects, thus
> >> it seems like a good place.
> >
> > Hmm. From the cpu perspective the memory barrier would be on the other
> > side in clear_bit_unlock(), and flush_submission does unlock_wait.
> >
> > But, then, we have to factor in that we have to communicate with an
> > external client, the GPU, so perhaps an explicit memory barrier...
> >
> > We certainly do perform the memory barrier in order to set the GPU in
> > motion, but have not relied on them for observing effects (aside from
> > the CSB ringbuf).
> >
> > I don't see a strong argument that adding a 'mb/rmb' here will make any
> > difference, I don't see what we are pairing with from the GPU
> > perspective. But maybe there is?
> 
> I don't have a strong argument from the gpu side. But what if the
> flush only does the non-atomic wait and returns, and our
> CPU has read the state up front for the next comparison?
> 
> Or, now thinking about it, the saving grace is that if we don't need
> to flush in here, the tasklet has finished and the finish has
> the barrier?

That's how we synchronize with the tasklet, yes. So by this point (as we
expect to have a request ready and scheduled the tasklet) we know that
the tasklet has completed.

So we know we've *initiated* some action on the GPU. Most of the
side-effects from doing so are without barriers.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context Chris Wilson
@ 2020-02-11 17:36   ` Mika Kuoppala
  0 siblings, 0 replies; 22+ messages in thread
From: Mika Kuoppala @ 2020-02-11 17:36 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Currently on execlists, we use a local hwsp for the kernel_context,
> rather than the engine's HWSP, as this is the default for execlists.
> However, seqno rollover requires allocating a new HWSP cachline, and may

s/cachline/cacheline

> require pinning a new HWSP page in the GTT. This operation requiring

s/GTT/GGTT

> pinning in the GGTT is not allowed within the kernel_context timeline,
> as doing so may require re-entering the kernel_context in order to evict
> from the GGTT. As we want to avoid requiring a new HWSP for the
> kernel_context, we can use the permanently pinned engine's HWSP instead.
> However to do so we must prevent the use of semaphores reading the
> kernel_context's HWSP, as the use of semaphores do not support rollover

Back in the day, it was called a seqno wrap. But I guess the cacheline
jump warrants a 'rollover'.

> onto the same cacheline. Fortunately, the kernel_context is mostly
> isolated, so unlikely to give benefit to semaphores.
>
> Reported-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/gt/intel_lrc.c    | 14 ++++++++++++--
>  drivers/gpu/drm/i915/gt/selftest_lrc.c | 12 +++++++++---
>  drivers/gpu/drm/i915/i915_request.c    | 14 +++++++++-----
>  3 files changed, 30 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 70d91ad923ef..902d440ef07d 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -2964,7 +2964,8 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
>  {
>  	u32 *cs;
>  
> -	GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
> +	if (!i915_request_timeline(rq)->has_initial_breadcrumb)
> +		return 0;
>  
>  	cs = intel_ring_begin(rq, 6);
>  	if (IS_ERR(cs))
> @@ -4616,8 +4617,17 @@ static int __execlists_context_alloc(struct intel_context *ce,
>  
>  	if (!ce->timeline) {
>  		struct intel_timeline *tl;
> +		struct i915_vma *hwsp;
> +
> +		/*
> +		 * Use the static global HWSP for the kernel context, and
> +		 * a dynamically allocated cacheline for everyone else.
> +		 */
> +		hwsp = NULL;
> +		if (unlikely(intel_context_is_barrier(ce)))
> +			hwsp = engine->status_page.vma;
>  
> -		tl = intel_timeline_create(engine->gt, NULL);
> +		tl = intel_timeline_create(engine->gt, hwsp);
>  		if (IS_ERR(tl)) {
>  			ret = PTR_ERR(tl);
>  			goto error_deref_obj;
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index ccd4cd2c202d..6f458f6d5523 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -3494,15 +3494,21 @@ static int bond_virtual_engine(struct intel_gt *gt,
>  	rq[0] = ERR_PTR(-ENOMEM);
>  	for_each_engine(master, gt, id) {
>  		struct i915_sw_fence fence = {};
> +		struct intel_context *ce;
>  
>  		if (master->class == class)
>  			continue;
>  
> +		ce = intel_context_create(master);
> +		if (IS_ERR(ce)) {
> +			err = PTR_ERR(ce);
> +			goto out;
> +		}
> +
>  		memset_p((void *)rq, ERR_PTR(-EINVAL), ARRAY_SIZE(rq));
>  
> -		rq[0] = igt_spinner_create_request(&spin,
> -						   master->kernel_context,
> -						   MI_NOOP);
> +		rq[0] = igt_spinner_create_request(&spin, ce, MI_NOOP);
> +		intel_context_put(ce);
>  		if (IS_ERR(rq[0])) {
>  			err = PTR_ERR(rq[0]);
>  			goto out;
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 0ecc2cf64216..1adb8cf35f75 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -886,6 +886,12 @@ emit_semaphore_wait(struct i915_request *to,
>  		    struct i915_request *from,
>  		    gfp_t gfp)
>  {
> +	if (!intel_context_use_semaphores(to->context))
> +		goto await_fence;
> +
> +	if (!rcu_access_pointer(from->hwsp_cacheline))
> +		goto await_fence;
> +
>  	/* Just emit the first semaphore we see as request space is limited. */
>  	if (already_busywaiting(to) & from->engine->mask)
>  		goto await_fence;
> @@ -931,12 +937,8 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>  		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
>  						       &from->submit,
>  						       I915_FENCE_GFP);
> -	else if (intel_context_use_semaphores(to->context))
> -		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>  	else
> -		ret = i915_sw_fence_await_dma_fence(&to->submit,
> -						    &from->fence, 0,
> -						    I915_FENCE_GFP);
> +		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>  	if (ret < 0)
>  		return ret;
>  
> @@ -1035,6 +1037,8 @@ __i915_request_await_execution(struct i915_request *to,
>  {
>  	int err;
>  
> +	GEM_BUG_ON(intel_context_is_barrier(from->context));
> +
>  	/* Submit both requests at the same time */
>  	err = __await_execution(to, from, hook, I915_FENCE_GFP);
>  	if (err)
> -- 
> 2.25.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay
  2020-02-10 20:57 ` [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay Chris Wilson
@ 2020-02-12  1:08   ` Daniele Ceraolo Spurio
  2020-02-14 10:10     ` Chris Wilson
  0 siblings, 1 reply; 22+ messages in thread
From: Daniele Ceraolo Spurio @ 2020-02-12  1:08 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx



On 2/10/20 12:57 PM, Chris Wilson wrote:
> To prevent the context from proceeding past the end of the request as we
> unwind, we embed a semaphore into the footer of each request. (If the
> context were to skip past the end of the request as we perform the
> preemption, next time we reload the context its RING_HEAD would be past
> the RING_TAIL and instead of replaying the commands it would read the
> rest of the uninitialised ringbuffer.)
> 
> However, this requires us to keep the ring paused at the end of the
> request until we have a chance to process the preemption ack and remove
> the semaphore. Our processing of acks is at the whim of ksoftirqd, and
> so it is entirely possible that the GPU has to wait for the tasklet
> before it can proceed with the next request.
> 
> It was suggested that we could also embed a MI_LOAD_REGISTER_MEM into
> the footer to read the current RING_TAIL from the context, which would
> allow us to not only avoid this round trip (and so release the context
> as soon as we had submitted the preemption request to the ELSP), but also
> skip using ELSP for lite-restores entirely. That has the nice benefit of
> dramatically reducing contention and the frequency of interrupts when a
> client submits two or more execbufs in rapid succession.
> 
> However, mmio access to RING_TAIL was defeatured in gen11 so we can only
> employ this handy trick for gen8/gen9.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
> ---
>   drivers/gpu/drm/i915/gt/intel_engine_types.h | 23 +++--
>   drivers/gpu/drm/i915/gt/intel_lrc.c          | 99 +++++++++++++++++++-
>   2 files changed, 109 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> index 24cff658e6e5..ae8724915320 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h
> @@ -488,14 +488,15 @@ struct intel_engine_cs {
>   	/* status_notifier: list of callbacks for context-switch changes */
>   	struct atomic_notifier_head context_status_notifier;
>   
> -#define I915_ENGINE_USING_CMD_PARSER BIT(0)
> -#define I915_ENGINE_SUPPORTS_STATS   BIT(1)
> -#define I915_ENGINE_HAS_PREEMPTION   BIT(2)
> -#define I915_ENGINE_HAS_SEMAPHORES   BIT(3)
> -#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET BIT(4)
> -#define I915_ENGINE_IS_VIRTUAL       BIT(5)
> -#define I915_ENGINE_HAS_RELATIVE_MMIO BIT(6)
> -#define I915_ENGINE_REQUIRES_CMD_PARSER BIT(7)
> +#define I915_ENGINE_REQUIRES_CMD_PARSER		BIT(0)
> +#define I915_ENGINE_USING_CMD_PARSER		BIT(1)
> +#define I915_ENGINE_SUPPORTS_STATS		BIT(2)
> +#define I915_ENGINE_HAS_PREEMPTION		BIT(3)
> +#define I915_ENGINE_HAS_SEMAPHORES		BIT(4)
> +#define I915_ENGINE_HAS_TAIL_LRM		BIT(5)
> +#define I915_ENGINE_NEEDS_BREADCRUMB_TASKLET	BIT(6)
> +#define I915_ENGINE_IS_VIRTUAL			BIT(7)
> +#define I915_ENGINE_HAS_RELATIVE_MMIO		BIT(8)
>   	unsigned int flags;
>   
>   	/*
> @@ -592,6 +593,12 @@ intel_engine_has_semaphores(const struct intel_engine_cs *engine)
>   	return engine->flags & I915_ENGINE_HAS_SEMAPHORES;
>   }
>   
> +static inline bool
> +intel_engine_has_tail_lrm(const struct intel_engine_cs *engine)
> +{
> +	return engine->flags & I915_ENGINE_HAS_TAIL_LRM;
> +}
> +
>   static inline bool
>   intel_engine_needs_breadcrumb_tasklet(const struct intel_engine_cs *engine)
>   {
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index 696f0b6b223c..5939672781fb 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -1797,6 +1797,74 @@ static inline void clear_ports(struct i915_request **ports, int count)
>   	memset_p((void **)ports, NULL, count);
>   }
>   
> +static struct i915_request *
> +skip_lite_restore(struct intel_engine_cs *const engine,
> +		  struct i915_request *first,
> +		  bool *submit)
> +{
> +	struct intel_engine_execlists *const execlists = &engine->execlists;
> +	struct i915_request *last = first;
> +	struct rb_node *rb;
> +
> +	if (!intel_engine_has_tail_lrm(engine))
> +		return last;
> +
> +	GEM_BUG_ON(*submit);
> +	while ((rb = rb_first_cached(&execlists->queue))) {
> +		struct i915_priolist *p = to_priolist(rb);
> +		struct i915_request *rq, *rn;
> +		int i;
> +
> +		priolist_for_each_request_consume(rq, rn, p, i) {
> +			if (!can_merge_rq(last, rq))
> +				goto out;
> +
> +			if (__i915_request_submit(rq)) {
> +				*submit = true;
> +				last = rq;
> +			}
> +		}
> +
> +		rb_erase_cached(&p->node, &execlists->queue);
> +		i915_priolist_free(p);
> +	}
> +out:
> +	if (*submit) {
> +		ring_set_paused(engine, 1);
> +
> +		/*
> +		 * If we are quick and the current context hasn't yet completed
> +		 * its request, we can just tell it to extend the RING_TAIL
> +		 * onto the next without having to submit a new ELSP.
> +		 */
> +		if (!i915_request_completed(first)) {
> +			struct i915_request **port;
> +
> +			ENGINE_TRACE(engine,
> +				     "eliding lite-restore last=%llx:%lld->%lld, current %d\n",
> +				     first->fence.context,
> +				     first->fence.seqno,
> +				     last->fence.seqno,
> +				     hwsp_seqno(last));
> +			GEM_BUG_ON(first->context != last->context);
> +
> +			execlists_update_context(last);
> +			for (port = (struct i915_request **)execlists->active;
> +			     *port != first;
> +			     port++)
> +				;
> +			WRITE_ONCE(*port, i915_request_get(last));
> +			i915_request_put(first);
> +
> +			*submit = false;
> +		}
> +
> +		ring_set_paused(engine, 0);
> +	}
> +
> +	return last;
> +}
> +
>   static void execlists_dequeue(struct intel_engine_cs *engine)
>   {
>   	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1934,6 +2002,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   
>   				return;
>   			}
> +
> +			last = skip_lite_restore(engine, last, &submit);
>   		}
>   	}
>   
> @@ -2155,10 +2225,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		WRITE_ONCE(execlists->yield, -1);
>   		execlists_submit_ports(engine);
>   		set_preempt_timeout(engine);
> -	} else {
> +	}
> +
> +	if (intel_engine_has_tail_lrm(engine) || !submit)

Why do we skip here if intel_engine_has_tail_lrm is true? I see that if 
we have work pending we either take the skip_lite_restore() or the 
submit path above, but I can't see why we need to explicitly skip 
re-starting the ring.

>   skip_submit:
>   		ring_set_paused(engine, 0);
> -	}
>   }
>   
>   static void
> @@ -2325,7 +2396,8 @@ static void process_csb(struct intel_engine_cs *engine)
>   
>   			GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
>   
> -			ring_set_paused(engine, 0);
> +			if (!intel_engine_has_tail_lrm(engine))
> +				ring_set_paused(engine, 0);
>   

here as well, although I'm assuming it has the same explanation as the 
one above.

Daniele

>   			/* Point active to the new ELSP; prevent overwriting */
>   			WRITE_ONCE(execlists->active, execlists->pending);
> @@ -4123,15 +4195,28 @@ static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
>   	return cs;
>   }
>   
> +static u32 *emit_lrm_tail(struct i915_request *request, u32 *cs)
> +{
> +	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
> +	*cs++ = i915_mmio_reg_offset(RING_TAIL(request->engine->mmio_base));
> +	*cs++ = i915_ggtt_offset(request->context->state) +
> +		LRC_STATE_PN * PAGE_SIZE +
> +		CTX_RING_TAIL * sizeof(u32);
> +	*cs++ = 0;
> +
> +	return cs;
> +}
> +
>   static __always_inline u32*
> -gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
> -				 u32 *cs)
> +gen8_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
>   {
>   	*cs++ = MI_USER_INTERRUPT;
>   
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
>   	if (intel_engine_has_semaphores(request->engine))
>   		cs = emit_preempt_busywait(request, cs);
> +	if (intel_engine_has_tail_lrm(request->engine))
> +		cs = emit_lrm_tail(request, cs);
>   
>   	request->tail = intel_ring_offset(request, cs);
>   	assert_ring_tail_valid(request->ring, request->tail);
> @@ -4220,6 +4305,8 @@ static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
>   static __always_inline u32*
>   gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
>   {
> +	GEM_BUG_ON(intel_engine_has_tail_lrm(request->engine));
> +
>   	*cs++ = MI_USER_INTERRUPT;
>   
>   	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
> @@ -4286,6 +4373,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
>   		engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
>   		if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
>   			engine->flags |= I915_ENGINE_HAS_PREEMPTION;
> +		if (INTEL_GEN(engine->i915) < 11)
> +			engine->flags |= I915_ENGINE_HAS_TAIL_LRM;
>   	}
>   
>   	if (INTEL_GEN(engine->i915) >= 12)
> 

* Re: [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay
  2020-02-12  1:08   ` Daniele Ceraolo Spurio
@ 2020-02-14 10:10     ` Chris Wilson
  0 siblings, 0 replies; 22+ messages in thread
From: Chris Wilson @ 2020-02-14 10:10 UTC (permalink / raw)
  To: Daniele Ceraolo Spurio, intel-gfx

Quoting Daniele Ceraolo Spurio (2020-02-12 01:08:30)
> 
> 
> On 2/10/20 12:57 PM, Chris Wilson wrote:
> > @@ -1934,6 +2002,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >   
> >                               return;
> >                       }
> > +
> > +                     last = skip_lite_restore(engine, last, &submit);
> >               }
> >       }
> >   
> > @@ -2155,10 +2225,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >               WRITE_ONCE(execlists->yield, -1);
> >               execlists_submit_ports(engine);
> >               set_preempt_timeout(engine);
> > -     } else {
> > +     }
> > +
> > +     if (intel_engine_has_tail_lrm(engine) || !submit)
> 
> Why do we skip here if intel_engine_has_tail_lrm is true? I see that if 
> we have work pending we either take the skip_lite_restore() or the 
> submit path above, but I can't see why we need to explicitly skip 
> re-starting the ring.

You mean if !has_lrm. We have to delay letting the RING_TAIL move past
the end of the request until the HW has acknowledged the preemption
request. This is required to prevent the ELSP submission from trying to
move the RING_TAIL backwards.

As it turns out, I can't special case has_lrm here since if we read the
new RING_TAIL before the ELSP event, we end up submitting the same
RING_TAIL again and triggering the HW bug.
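
(Concretely, the "pause" here is the HWS_PREEMPT dword polled by the
busywait in each request's footer; a sketch of the helper, assuming this
era's intel_lrc.c layout:

	static void
	ring_set_paused(const struct intel_engine_cs *engine, int state)
	{
		/*
		 * While HWS_PREEMPT is non-zero, the MI_SEMAPHORE_WAIT at
		 * the end of each request holds the CS, so the RING_TAIL
		 * cannot advance past the request until we release it.
		 */
		engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
		if (state)
			wmb();
	}

)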

> 
> >   skip_submit:
> >               ring_set_paused(engine, 0);
> > -     }
> >   }
> >   
> >   static void
> > @@ -2325,7 +2396,8 @@ static void process_csb(struct intel_engine_cs *engine)
> >   
> >                       GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
> >   
> > -                     ring_set_paused(engine, 0);
> > +                     if (!intel_engine_has_tail_lrm(engine))
> > +                             ring_set_paused(engine, 0);
> >   
> 
> here as well, although I'm assuming it has the same explanation as the 
> one above.

For has_lrm, it will have already seen the new RING_TAIL at the end of
the request regardless of the preempting ELSP.

However, as noted, unless we set only wa_tail in the context-image and
let the LRM load the normal tail, we end up hitting WaIdleLiteRestore
and killing the HW.
Bleugh.
-Chris

end of thread

Thread overview: 22+ messages
2020-02-10 20:57 [Intel-gfx] [PATCH 1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Chris Wilson
2020-02-10 20:57 ` [Intel-gfx] [PATCH 2/7] drm/i915/selftests: Exercise timeslice rewinding Chris Wilson
2020-02-11 14:50   ` Mika Kuoppala
2020-02-11 15:16     ` Chris Wilson
2020-02-10 20:57 ` [Intel-gfx] [PATCH 3/7] drm/i915/selftests: Relax timeout for error-interrupt reset processing Chris Wilson
2020-02-11 15:23   ` Mika Kuoppala
2020-02-11 15:33     ` Chris Wilson
2020-02-11 15:54       ` Mika Kuoppala
2020-02-11 16:00         ` Chris Wilson
2020-02-10 20:57 ` [Intel-gfx] [PATCH 4/7] drm/i915/gem: Don't leak non-persistent requests on changing engines Chris Wilson
2020-02-11 13:41   ` Tvrtko Ursulin
2020-02-11 14:15     ` Chris Wilson
2020-02-10 20:57 ` [Intel-gfx] [PATCH 5/7] drm/i915: Disable use of hwsp_cacheline for kernel_context Chris Wilson
2020-02-11 17:36   ` Mika Kuoppala
2020-02-10 20:57 ` [Intel-gfx] [PATCH 6/7] drm/i915/gt: Yield the timeslice if caught waiting on a user semaphore Chris Wilson
2020-02-10 20:57 ` [Intel-gfx] [PATCH 7/7] drm/i915/execlists: Remove preempt-to-busy roundtrip delay Chris Wilson
2020-02-12  1:08   ` Daniele Ceraolo Spurio
2020-02-14 10:10     ` Chris Wilson
2020-02-10 22:48 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915/gt: Avoid resetting ring->head outside of its timeline mutex Patchwork
2020-02-10 23:14 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
2020-02-11 11:49 ` [Intel-gfx] [PATCH 1/7] " Andi Shyti
2020-02-11 11:58 ` Mika Kuoppala
