Re: [PATCH 07/10] drm/i915/execlists: Cancel banned contexts on schedule-out

From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [PATCH 07/10] drm/i915/execlists: Cancel banned contexts on schedule-out
Date: Fri, 11 Oct 2019 10:47:26 +0100	[thread overview]
Message-ID: <e9b171f6-be40-fb86-99e0-3bfe5f69404e@linux.intel.com> (raw)
In-Reply-To: <20191010071434.31195-7-chris@chris-wilson.co.uk>

On 10/10/2019 08:14, Chris Wilson wrote:
> On completion of a banned context, scrub the context image so that we do

s/completion/schedule out/ like in the subject line? Otherwise I 
struggle to understand how banned context is completing. Presumably it 
was banned because it keeps hanging.

> not replay the active payload. The intent is that we skip banned
> payloads on request submission so that the timeline advancement
> continues on in the background. However, if we are returning to a
> preempted request, i915_request_skip() is ineffective and instead we
> need to patch up the context image so that it continues from the start
> of the next request.

But if the context is banned why do we want to continue from the start 
of the next request? Don't we want to zap all submitted so far?

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/gt/intel_lrc.c    |  58 ++++++
>   drivers/gpu/drm/i915/gt/selftest_lrc.c | 273 +++++++++++++++++++++++++
>   2 files changed, 331 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
> index eb99f1e804f7..79c7ebea2fcc 100644
> --- a/drivers/gpu/drm/i915/gt/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
> @@ -234,6 +234,9 @@ static void execlists_init_reg_state(u32 *reg_state,
>   				     const struct intel_engine_cs *engine,
>   				     const struct intel_ring *ring,
>   				     bool close);
> +static void
> +__execlists_update_reg_state(const struct intel_context *ce,
> +			     const struct intel_engine_cs *engine);
>   
>   static void __context_pin_acquire(struct intel_context *ce)
>   {
> @@ -1022,6 +1025,58 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
>   		tasklet_schedule(&ve->base.execlists.tasklet);
>   }
>   
> +static void
> +mark_complete(struct i915_request *rq, struct intel_engine_cs *engine)
> +{
> +	const struct intel_timeline * const tl = rcu_dereference(rq->timeline);
> +
> +	*(u32 *)tl->hwsp_seqno = rq->fence.seqno;
> +	GEM_BUG_ON(!i915_request_completed(rq));
> +
> +	list_for_each_entry_from_reverse(rq, &tl->requests, link) {
> +		if (i915_request_signaled(rq))
> +			break;
> +
> +		mark_eio(rq);

This would -EIO requests which have potentially be completed but not 
retired yet? If so why?

> +	}
> +
> +	intel_engine_queue_breadcrumbs(engine);
> +}
> +
> +static void cancel_active(struct i915_request *rq,
> +			  struct intel_engine_cs *engine)
> +{
> +	struct intel_context * const ce = rq->hw_context;
> +	u32 *regs = ce->lrc_reg_state;
> +
> +	if (i915_request_completed(rq))
> +		return;
> +
> +	GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
> +		  __func__, engine->name, rq->fence.context, rq->fence.seqno);
> +	__context_pin_acquire(ce);
> +
> +	/* Scrub the context image to prevent replaying the previous batch */
> +	memcpy(regs, /* skip restoring the vanilla PPHWSP */
> +	       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
> +	       engine->context_size - PAGE_SIZE);

context_size - LRC_STATE_PN * PAGE_SIZE ?

> +	execlists_init_reg_state(regs, ce, engine, ce->ring, false);
> +
> +	/* Ring will be advanced on retire; here we need to reset the context */
> +	ce->ring->head = intel_ring_wrap(ce->ring, rq->wa_tail);
> +	__execlists_update_reg_state(ce, engine);
> +
> +	/* We've switched away, so this should be a no-op, but intent matters */
> +	ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
> +
> +	/* Let everyone know that the request may now be retired */
> +	rcu_read_lock();
> +	mark_complete(rq, engine);
> +	rcu_read_unlock();
> +
> +	__context_pin_release(ce);
> +}
> +
>   static inline void
>   __execlists_schedule_out(struct i915_request *rq,
>   			 struct intel_engine_cs * const engine)
> @@ -1032,6 +1087,9 @@ __execlists_schedule_out(struct i915_request *rq,
>   	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
>   	intel_gt_pm_put(engine->gt);
>   
> +	if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
> +		cancel_active(rq, engine);

Or you are counting this is already the last runnable request from this 
context due coalescing? It wouldn't work if for any reason coalescing 
would be prevented. Either with GVT, or I had some ideas to prevent 
coalescing for contexts where watchdog is enabled in the future. In 
which case this would be a hidden gotcha. Maybe all that's needed in 
mark_complete is also to look towards the end of the list?

Regards,

Tvrtko

> +
>   	/*
>   	 * If this is part of a virtual engine, its next request may
>   	 * have been blocked waiting for access to the active context.
> diff --git a/drivers/gpu/drm/i915/gt/selftest_lrc.c b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> index 198cf2f754f4..1703130ef0ef 100644
> --- a/drivers/gpu/drm/i915/gt/selftest_lrc.c
> +++ b/drivers/gpu/drm/i915/gt/selftest_lrc.c
> @@ -7,6 +7,7 @@
>   #include <linux/prime_numbers.h>
>   
>   #include "gem/i915_gem_pm.h"
> +#include "gt/intel_engine_heartbeat.h"
>   #include "gt/intel_reset.h"
>   
>   #include "i915_selftest.h"
> @@ -986,6 +987,277 @@ static int live_nopreempt(void *arg)
>   	goto err_client_b;
>   }
>   
> +struct live_preempt_cancel {
> +	struct intel_engine_cs *engine;
> +	struct preempt_client a, b;
> +};
> +
> +static int __cancel_active0(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq;
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Preempt cancel of ELSP0 */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq = spinner_create_request(&arg->a.spin,
> +				    arg->a.ctx, arg->engine,
> +				    MI_ARB_CHECK);
> +	if (IS_ERR(rq))
> +		return PTR_ERR(rq);
> +
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq)) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	i915_gem_context_set_banned(arg->a.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq->fence.error != -EIO) {
> +		pr_err("Cancelled inflight0 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int __cancel_active1(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq[2] = {};
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Preempt cancel of ELSP1 */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq[0] = spinner_create_request(&arg->a.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_NOOP); /* no preemption */
> +	if (IS_ERR(rq[0]))
> +		return PTR_ERR(rq[0]);
> +
> +	i915_request_get(rq[0]);
> +	i915_request_add(rq[0]);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
> +	rq[1] = spinner_create_request(&arg->b.spin,
> +				       arg->b.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[1])) {
> +		err = PTR_ERR(rq[1]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[1]);
> +	err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
> +	i915_request_add(rq[1]);
> +	if (err)
> +		goto out;
> +
> +	i915_gem_context_set_banned(arg->b.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	igt_spinner_end(&arg->a.spin);
> +	if (i915_request_wait(rq[1], 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq[0]->fence.error != 0) {
> +		pr_err("Normal inflight0 request did not complete\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[1]->fence.error != -EIO) {
> +		pr_err("Cancelled inflight1 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq[1]);
> +	i915_request_put(rq[0]);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int __cancel_queued(struct live_preempt_cancel *arg)
> +{
> +	struct i915_request *rq[3] = {};
> +	struct igt_live_test t;
> +	int err;
> +
> +	/* Full ELSP and one in the wings */
> +	GEM_TRACE("%s(%s)\n", __func__, arg->engine->name);
> +
> +	if (igt_live_test_begin(&t, arg->engine->i915,
> +				__func__, arg->engine->name))
> +		return -EIO;
> +
> +	clear_bit(CONTEXT_BANNED, &arg->a.ctx->flags);
> +	rq[0] = spinner_create_request(&arg->a.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[0]))
> +		return PTR_ERR(rq[0]);
> +
> +	i915_request_get(rq[0]);
> +	i915_request_add(rq[0]);
> +	if (!igt_wait_for_spinner(&arg->a.spin, rq[0])) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	clear_bit(CONTEXT_BANNED, &arg->b.ctx->flags);
> +	rq[1] = igt_request_alloc(arg->b.ctx, arg->engine);
> +	if (IS_ERR(rq[1])) {
> +		err = PTR_ERR(rq[1]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[1]);
> +	err = i915_request_await_dma_fence(rq[1], &rq[0]->fence);
> +	i915_request_add(rq[1]);
> +	if (err)
> +		goto out;
> +
> +	rq[2] = spinner_create_request(&arg->b.spin,
> +				       arg->a.ctx, arg->engine,
> +				       MI_ARB_CHECK);
> +	if (IS_ERR(rq[2])) {
> +		err = PTR_ERR(rq[2]);
> +		goto out;
> +	}
> +
> +	i915_request_get(rq[2]);
> +	err = i915_request_await_dma_fence(rq[2], &rq[1]->fence);
> +	i915_request_add(rq[2]);
> +	if (err)
> +		goto out;
> +
> +	i915_gem_context_set_banned(arg->a.ctx);
> +	err = intel_engine_pulse(arg->engine);
> +	if (err)
> +		goto out;
> +
> +	if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
> +		err = -EIO;
> +		goto out;
> +	}
> +
> +	if (rq[0]->fence.error != -EIO) {
> +		pr_err("Cancelled inflight0 request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[1]->fence.error != 0) {
> +		pr_err("Normal inflight1 request did not complete\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +	if (rq[2]->fence.error != -EIO) {
> +		pr_err("Cancelled queued request did not report -EIO\n");
> +		err = -EINVAL;
> +		goto out;
> +	}
> +
> +out:
> +	i915_request_put(rq[2]);
> +	i915_request_put(rq[1]);
> +	i915_request_put(rq[0]);
> +	if (igt_live_test_end(&t))
> +		err = -EIO;
> +	return err;
> +}
> +
> +static int live_preempt_cancel(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct live_preempt_cancel data;
> +	enum intel_engine_id id;
> +	int err = -ENOMEM;
> +
> +	/*
> +	 * To cancel an inflight context, we need to first remove it from the
> +	 * GPU. That sounds like preemption! Plus a little bit of bookkeeping.
> +	 */
> +
> +	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
> +		return 0;
> +
> +	if (preempt_client_init(i915, &data.a))
> +		return -ENOMEM;
> +	if (preempt_client_init(i915, &data.b))
> +		goto err_client_a;
> +
> +	for_each_engine(data.engine, i915, id) {
> +		if (!intel_engine_has_preemption(data.engine))
> +			continue;
> +
> +		err = __cancel_active0(&data);
> +		if (err)
> +			goto err_wedged;
> +
> +		err = __cancel_active1(&data);
> +		if (err)
> +			goto err_wedged;
> +
> +		err = __cancel_queued(&data);
> +		if (err)
> +			goto err_wedged;
> +	}
> +
> +	err = 0;
> +err_client_b:
> +	preempt_client_fini(&data.b);
> +err_client_a:
> +	preempt_client_fini(&data.a);
> +	return err;
> +
> +err_wedged:
> +	GEM_TRACE_DUMP();
> +	igt_spinner_end(&data.b.spin);
> +	igt_spinner_end(&data.a.spin);
> +	intel_gt_set_wedged(&i915->gt);
> +	goto err_client_b;
> +}
> +
>   static int live_suppress_self_preempt(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> @@ -2270,6 +2542,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(live_preempt),
>   		SUBTEST(live_late_preempt),
>   		SUBTEST(live_nopreempt),
> +		SUBTEST(live_preempt_cancel),
>   		SUBTEST(live_suppress_self_preempt),
>   		SUBTEST(live_suppress_wait_preempt),
>   		SUBTEST(live_chain_preempt),
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx