All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 07/10] drm/i915/execlists: Cancel banned contexts on schedule-out
Date: Fri, 11 Oct 2019 10:47:26 +0100	[thread overview]
Message-ID: <e9b171f6-be40-fb86-99e0-3bfe5f69404e@linux.intel.com> (raw)
In-Reply-To: <20191010071434.31195-7-chris@chris-wilson.co.uk>


On 10/10/2019 08:14, Chris Wilson wrote:
> On completion of a banned context, scrub the context image so that we do

s/completion/schedule out/ like in the subject line? Otherwise I 
struggle to understand how banned context is completing. Presumably it 
was banned because it keeps hanging.

> not replay the active payload. The intent is that we skip banned
> payloads on request submission so that the timeline advancement
> continues on in the background. However, if we are returning to a
> preempted request, i915_request_skip() is ineffective and instead we
> need to patch up the context image so that it continues from the start
> of the next request.

But if the context is banned why do we want to continue from the start 
of the next request? Don't we want to zap all submitted so far?

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c    |  58 ++++++
>   drivers/gpu/drm/i915/gt/selftest_lrc.c | 273 +++++++++++++++++++++++++
>   2 files changed, 331 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index eb99f1e804f7..79c7ebea2fcc 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -234,6 +234,9 @@ static void execlists_init_reg_state(u32 *reg_state,
>   				     const struct intel_engine_cs *engine,
>   				     const struct intel_ring *ring,
>   				     bool close);
> +static void
> +__execlists_update_reg_state(const struct intel_context *ce,
> +			     const struct intel_engine_cs *engine);
>   
>   static void __context_pin_acquire(struct intel_context *ce)
>   {
> @@ -1022,6 +1025,58 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>   		tasklet_schedule(&ve->base.execlists.tasklet);
>   }
>   
> +static void
> +mark_complete(struct i915_request *rq, struct intel_engine_cs *engine)
> +{
> +	const struct intel_timeline * const tl = rcu_dereference(rq->timeline);
> +
> +	*(u32 *)tl->hwsp_seqno = rq->fence.seqno;
> +	GEM_BUG_ON(!i915_request_completed(rq));
> +
> +	list_for_each_entry_from_reverse(rq, &tl->requests, link) {
> +		if (i915_request_signaled(rq))
> +			break;
> +
> +		mark_eio(rq);

This would -EIO requests which have potentially be completed but not 
retired yet? If so why?

> +	}
> +
> +	intel_engine_queue_breadcrumbs(engine);
> +}
> +
> +static void cancel_active(struct i915_request *rq,
> +			  struct intel_engine_cs *engine)
> +{
> +	struct intel_context * const ce = rq->hw_context;
> +	u32 *regs = ce->lrc_reg_state;
> +
> +	if (i915_request_completed(rq))
> +		return;
> +
> +	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> +		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +	__context_pin_acquire(ce);
> +
> +	/* Scrub the context image to prevent replaying the previous batch */
> +	memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +	       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> +	       engine->context_size - PAGE_SIZE);

context_size - LRC_STATE_PN * PAGE_SIZE ?

> +	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> +
> +	/* Ring will be advanced on retire; here we need to reset the context */
> +	ce->ring->head = intel_ring_wrap(ce->ring, rq->wa_tail);
> +	__execlists_update_reg_state(ce, engine);
> +
> +	/* We've switched away, so this should be a no-op, but intent matters */
> +	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +
> +	/* Let everyone know that the request may now be retired */
> +	rcu_read_lock();
> +	mark_complete(rq, engine);
> +	rcu_read_unlock();
> +
> +	__context_pin_release(ce);
> +}
> +
>   static inline void
>   __execlists_schedule_out(struct i915_request *rq,
>   			 struct intel_engine_cs * const engine)
> @@ -1032,6 +1087,9 @@ __execlists_schedule_out(struct i915_request *rq,
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
>   	intel_gt_pm_put(engine->gt);
>   
> +	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> +		cancel_active(rq, engine);

Or you are counting this is already the last runnable request from this 
context due coalescing? It wouldn't work if for any reason coalescing 
would be prevented. Either with GVT, or I had some ideas to prevent 
coalescing for contexts where watchdog is enabled in the future. In 
which case this would be a hidden gotcha. Maybe all that's needed in 
mark_complete is also to look towards the end of the list?

Regards,

Tvrtko

> +
>   	/*
>   	 * If this is part of a virtual engine, its next request may
>   	 * have been blocked waiting for access to the active context.
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 198cf2f754f4..1703130ef0ef 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -7,6 +7,7 @@
>   #include <linux/prime_numbers.h>
>   
>   #include "gem/i915_gem_pm.h"
> +#include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_reset.h"
>   
>   #include "i915_selftest.h"
> @@ -986,6 +987,277 @@ static int live_nopreempt(void *arg)
>   	goto err_client_b;
>   }
>   
> +struct live_preempt_cancel {
> +	struct intel_engine_cs *engine;
> +	struct preempt_client a, b;
> +};
> +
> +static int __cancel_active0(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq;
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Preempt cancel of ELSP0 */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq = spinner_create_request(&arg->a.spin,
> +				    arg->a.ctx, arg->engine,
> +				    MI_ARB_CHECK);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	i915_gem_context_set_banned(arg->a.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq->fence.error != -EIO) {
> +		pr_err("Cancelled inflight0 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int __cancel_active1(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq[2] = {};
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Preempt cancel of ELSP1 */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq[0] = spinner_create_request(&arg->a.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_NOOP); /* no preemption */
> +	if (IS_ERR(rq[0]))
> +		return PTR_ERR(rq[0]);
> +
> +	i915_request_get(rq[0]);
> +	i915_request_add(rq[0]);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
> +	rq[1] = spinner_create_request(&arg->b.spin,
> +				       arg->b.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[1])) {
> +		err = PTR_ERR(rq[1]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[1]);
> +	err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
> +	i915_request_add(rq[1]);
> +	if (err)
> +		goto out;
> +
> +	i915_gem_context_set_banned(arg->b.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	igt_spinner_end(&arg->a.spin);
> +	if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq[0]->fence.error != 0) {
> +		pr_err("Normal inflight0 request did not complete\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[1]->fence.error != -EIO) {
> +		pr_err("Cancelled inflight1 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq[1]);
> +	i915_request_put(rq[0]);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int __cancel_queued(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq[3] = {};
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Full ELSP and one in the wings */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq[0] = spinner_create_request(&arg->a.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[0]))
> +		return PTR_ERR(rq[0]);
> +
> +	i915_request_get(rq[0]);
> +	i915_request_add(rq[0]);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
> +	rq[1] = igt_request_alloc(arg->b.ctx, arg->engine);
> +	if (IS_ERR(rq[1])) {
> +		err = PTR_ERR(rq[1]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[1]);
> +	err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
> +	i915_request_add(rq[1]);
> +	if (err)
> +		goto out;
> +
> +	rq[2] = spinner_create_request(&arg->b.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[2])) {
> +		err = PTR_ERR(rq[2]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[2]);
> +	err = i915_request_await_dma_fence(rq[2], &rq[1]->fence);
> +	i915_request_add(rq[2]);
> +	if (err)
> +		goto out;
> +
> +	i915_gem_context_set_banned(arg->a.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq[0]->fence.error != -EIO) {
> +		pr_err("Cancelled inflight0 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[1]->fence.error != 0) {
> +		pr_err("Normal inflight1 request did not complete\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[2]->fence.error != -EIO) {
> +		pr_err("Cancelled queued request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq[2]);
> +	i915_request_put(rq[1]);
> +	i915_request_put(rq[0]);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int live_preempt_cancel(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct live_preempt_cancel data;
> +	enum intel_engine_id id;
> +	int err = -ENOMEM;
> +
> +	/*
> +	 * To cancel an inflight context, we need to first remove it from the
> +	 * GPU. That sounds like preemption! Plus a little bit of bookkeeping.
> +	 */
> +
> +	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
> +		return 0;
> +
> +	if (preempt_client_init(i915, &data.a))
> +		return -ENOMEM;
> +	if (preempt_client_init(i915, &data.b))
> +		goto err_client_a;
> +
> +	for_each_engine(data.engine, i915, id) {
> +		if (!intel_engine_has_preemption(data.engine))
> +			continue;
> +
> +		err = __cancel_active0(&data);
> +		if (err)
> +			goto err_wedged;
> +
> +		err = __cancel_active1(&data);
> +		if (err)
> +			goto err_wedged;
> +
> +		err = __cancel_queued(&data);
> +		if (err)
> +			goto err_wedged;
> +	}
> +
> +	err = 0;
> +err_client_b:
> +	preempt_client_fini(&data.b);
> +err_client_a:
> +	preempt_client_fini(&data.a);
> +	return err;
> +
> +err_wedged:
> +	GEM_TRACE_DUMP();
> +	igt_spinner_end(&data.b.spin);
> +	igt_spinner_end(&data.a.spin);
> +	intel_gt_set_wedged(&i915->gt);
> +	goto err_client_b;
> +}
> +
>   static int live_suppress_self_preempt(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> @@ -2270,6 +2542,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_preempt),
>   		SUBTEST(live_late_preempt),
>   		SUBTEST(live_nopreempt),
> +		SUBTEST(live_preempt_cancel),
>   		SUBTEST(live_suppress_self_preempt),
>   		SUBTEST(live_suppress_wait_preempt),
>   		SUBTEST(live_chain_preempt),
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2019-10-11  9:47 UTC|newest]

Thread overview: 42+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-10-10  7:14 [PATCH 01/10] drm/i915: Note the addition of timeslicing to the pretend scheduler Chris Wilson
2019-10-10  7:14 ` [PATCH 02/10] drm/i915/execlists: Leave tell-tales as to why pending[] is bad Chris Wilson
2019-10-11  8:39   ` Tvrtko Ursulin
2019-10-10  7:14 ` [PATCH 03/10] drm/i915: Expose engine properties via sysfs Chris Wilson
2019-10-11  8:44   ` Tvrtko Ursulin
2019-10-11  8:49     ` Chris Wilson
2019-10-11  9:04       ` Tvrtko Ursulin
2019-10-11  9:40   ` [PATCH v2] " Chris Wilson
2019-10-10  7:14 ` [PATCH 04/10] drm/i915/execlists: Force preemption Chris Wilson
2019-10-10  7:14 ` [PATCH 05/10] drm/i915: Mark up "sentinel" requests Chris Wilson
2019-10-11  8:45   ` Tvrtko Ursulin
2019-10-10  7:14 ` [PATCH 06/10] drm/i915/gt: Introduce barrier pulses along engines Chris Wilson
2019-10-11  9:11   ` Tvrtko Ursulin
2019-10-11  9:52     ` Chris Wilson
2019-10-10  7:14 ` [PATCH 07/10] drm/i915/execlists: Cancel banned contexts on schedule-out Chris Wilson
2019-10-11  9:47   ` Tvrtko Ursulin [this message]
2019-10-11 10:03     ` Chris Wilson
2019-10-11 10:15     ` Chris Wilson
2019-10-11 10:40       ` Chris Wilson
2019-10-11 11:16   ` [PATCH v2] " Chris Wilson
2019-10-11 13:10     ` Tvrtko Ursulin
2019-10-11 14:10       ` Chris Wilson
2019-10-10  7:14 ` [PATCH 08/10] drm/i915: Cancel non-persistent contexts on close Chris Wilson
2019-10-11 13:55   ` Tvrtko Ursulin
2019-10-11 14:22     ` Chris Wilson
2019-10-11 15:41       ` Chris Wilson
2019-10-10  7:14 ` [PATCH 09/10] drm/i915: Replace hangcheck by heartbeats Chris Wilson
2019-10-11 14:24   ` Tvrtko Ursulin
2019-10-11 15:06     ` Chris Wilson
2019-10-10  7:14 ` [PATCH 10/10] drm/i915: Flush idle barriers when waiting Chris Wilson
2019-10-11 14:56   ` Tvrtko Ursulin
2019-10-11 15:11     ` Chris Wilson
2019-10-14 13:08       ` Tvrtko Ursulin
2019-10-14 13:38         ` Chris Wilson
2019-10-23 15:33         ` Chris Wilson
2019-10-23 15:33           ` [Intel-gfx] " Chris Wilson
2019-10-10  8:18 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/10] drm/i915: Note the addition of timeslicing to the pretend scheduler Patchwork
2019-10-10  8:42 ` ✓ Fi.CI.BAT: success " Patchwork
2019-10-10 16:19 ` ✗ Fi.CI.IGT: failure " Patchwork
2019-10-11  8:16 ` [PATCH 01/10] " Tvrtko Ursulin
2019-10-11  9:49 ` ✗ Fi.CI.BUILD: failure for series starting with [01/10] drm/i915: Note the addition of timeslicing to the pretend scheduler (rev2) Patchwork
2019-10-11 11:39 ` ✗ Fi.CI.BUILD: failure for series starting with [01/10] drm/i915: Note the addition of timeslicing to the pretend scheduler (rev3) Patchwork

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=e9b171f6-be40-fb86-99e0-3bfe5f69404e@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.