All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Subject: Re: [RESEND 2/6] drm/i915/selftests: Exercise reset to break stuck GTT eviction
Date: Mon, 16 Jul 2018 10:32:25 +0100	[thread overview]
Message-ID: <280388cf-4e8f-6179-4eda-410665465df2@linux.intel.com> (raw)
In-Reply-To: <20180716080332.32283-2-chris@chris-wilson.co.uk>


On 16/07/2018 09:03, Chris Wilson wrote:
> We must be able to reset the GPU while we are waiting on it to perform
> an eviction (unbinding an active vma). So attach a spinning request to a
> target vma and try and it evict it from a thread to see if that blocks
> indefinitely.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   .../gpu/drm/i915/selftests/intel_hangcheck.c  | 153 +++++++++++++++++-
>   1 file changed, 151 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 73462a65a330..344973664206 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -27,6 +27,7 @@
>   #include "../i915_selftest.h"
>   #include "i915_random.h"
>   #include "igt_flush_test.h"
> +#include "igt_wedge_me.h"
>   
>   #include "mock_context.h"
>   #include "mock_drm.h"
> @@ -921,7 +922,7 @@ static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
>   	return reset_count;
>   }
>   
> -static int igt_wait_reset(void *arg)
> +static int igt_reset_wait(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
>   	struct i915_request *rq;
> @@ -995,6 +996,152 @@ static int igt_wait_reset(void *arg)
>   	return err;
>   }
>   
> +static int evict_vma(void *data)
> +{
> +	struct i915_vma *vma = data;
> +	struct drm_i915_private *i915 = vma->vm->i915;
> +	struct drm_mm_node evict = vma->node;
> +	int err;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	err = i915_gem_evict_for_node(vma->vm, &evict, 0);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	return err;
> +}
> +
> +static int __igt_reset_evict_vma(struct drm_i915_private *i915,
> +				 struct i915_address_space *vm)
> +{
> +	struct drm_i915_gem_object *obj;
> +	struct task_struct *tsk = NULL;
> +	struct i915_request *rq;
> +	struct i915_vma *vma;
> +	struct hang h;
> +	int err;
> +
> +	if (!intel_engine_can_store_dword(i915->engine[RCS]))
> +		return 0;
> +
> +	/* Check that we can recover an unbind stuck on a hanging request */
> +
> +	global_reset_lock(i915);
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	err = hang_init(&h, i915);
> +	if (err)
> +		goto unlock;
> +
> +	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(obj)) {
> +		err = PTR_ERR(obj);
> +		goto fini;
> +	}
> +
> +	vma = i915_vma_instance(obj, vm, NULL);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto out_obj;
> +	}
> +
> +	rq = hang_create_request(&h, i915->engine[RCS]);
> +	if (IS_ERR(rq)) {
> +		err = PTR_ERR(rq);
> +		goto out_obj;
> +	}
> +
> +	err = i915_vma_pin(vma, 0, 0,
> +			   i915_vma_is_ggtt(vma) ? PIN_GLOBAL : PIN_USER);
> +	if (err)
> +		goto out_obj;
> +
> +	err = i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE);
> +	i915_vma_unpin(vma);
> +
> +	i915_request_get(rq);
> +	i915_request_add(rq);
> +	if (err)
> +		goto out_rq;
> +
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	if (!wait_until_running(&h, rq)) {
> +		struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> +		pr_err("%s: Failed to start request %x, at %x\n",
> +		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
> +		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
> +
> +		i915_gem_set_wedged(i915);
> +		goto out_reset;
> +	}
> +
> +	tsk = kthread_run(evict_vma, vma, "igt/evict_vma");
> +
> +	if (wait_for(waitqueue_active(&rq->execute), 10)) {

So this is 10ms for the thread to start and start waiting. I am trying 
to judge if it will be stable enough.. And since it cannot distinguish 
between the failure of thread to start and get to the bit where it is 
supposed to wait, versus actually not waiting.

Maybe set a flag just before i915_gem_evict_for_node and wait on that first?

And there is a comment against waitqueue_active that warns about races 
but I think it is fine in this case.

> +		struct drm_printer p = drm_info_printer(i915->drm.dev);
> +
> +		pr_err("igt/evict_vma kthread did not wait\n");
> +		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
> +
> +		i915_gem_set_wedged(i915);
> +		goto out_reset;
> +	}
> +
> +out_reset:
> +	fake_hangcheck(rq, intel_engine_flag(rq->engine));
> +
> +	if (tsk) {
> +		struct igt_wedge_me w;
> +
> +		igt_wedge_on_timeout(&w, i915, HZ / 5)
> +			err = kthread_stop(tsk);

This means 200ms grace for reset to terminate the spinner and unblock 
the eviction thread? Is that enough on all platforms?

> +	}
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +out_rq:
> +	i915_request_put(rq);
> +out_obj:
> +	i915_gem_object_put(obj);
> +fini:
> +	hang_fini(&h);
> +unlock:
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	global_reset_unlock(i915);
> +
> +	if (i915_terminally_wedged(&i915->gpu_error))
> +		return -EIO;
> +
> +	return err;
> +}
> +
> +static int igt_reset_evict_ggtt(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +
> +	return __igt_reset_evict_vma(i915, &i915->ggtt.vm);
> +}
> +
> +static int igt_reset_evict_ppgtt(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct i915_gem_context *ctx;
> +	int err;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	ctx = kernel_context(i915);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	if (IS_ERR(ctx))
> +		return PTR_ERR(ctx);
> +
> +	err = 0;
> +	if (ctx->ppgtt)

Just skip the test when !USES_FULL_PPGTT?

> +		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm);
> +
> +	kernel_context_close(ctx);
> +	return err;
> +}
> +
>   static int wait_for_others(struct drm_i915_private *i915,
>   			   struct intel_engine_cs *exclude)
>   {
> @@ -1240,8 +1387,10 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
>   		SUBTEST(igt_reset_idle_engine),
>   		SUBTEST(igt_reset_active_engine),
>   		SUBTEST(igt_reset_engines),
> -		SUBTEST(igt_wait_reset),
>   		SUBTEST(igt_reset_queue),
> +		SUBTEST(igt_reset_wait),
> +		SUBTEST(igt_reset_evict_ggtt),
> +		SUBTEST(igt_reset_evict_ppgtt),
>   		SUBTEST(igt_handle_error),
>   	};
>   	bool saved_hangcheck;
> 

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

  reply	other threads:[~2018-07-16  9:32 UTC|newest]

Thread overview: 18+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-16  8:03 [RESEND 1/6] drm/i915/selftests: Force a preemption hang Chris Wilson
2018-07-16  8:03 ` [RESEND 2/6] drm/i915/selftests: Exercise reset to break stuck GTT eviction Chris Wilson
2018-07-16  9:32   ` Tvrtko Ursulin [this message]
2018-07-16  8:03 ` [RESEND 3/6] drm/i915/execlists: Always clear preempt status on cancelling all Chris Wilson
2018-07-16  9:45   ` Tvrtko Ursulin
2018-07-16  8:03 ` [RESEND 4/6] drm/i915/execlists: Disable submission tasklet upon wedging Chris Wilson
2018-07-16  9:59   ` Tvrtko Ursulin
2018-07-16  8:03 ` [RESEND 5/6] drm/i915: Remove pci private pointer after destroying the device private Chris Wilson
2018-07-16 10:04   ` Tvrtko Ursulin
2018-07-16 10:32   ` Michal Wajdeczko
2018-07-16 10:37     ` Chris Wilson
2018-07-16  8:03 ` [RESEND 6/6] drm/i915/selftests: Downgrade igt_timeout message Chris Wilson
2018-07-16 10:06   ` Tvrtko Ursulin
2018-07-16  8:32 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [RESEND,1/6] drm/i915/selftests: Force a preemption hang Patchwork
2018-07-16  8:53 ` ✓ Fi.CI.BAT: success " Patchwork
2018-07-16  9:08 ` [RESEND 1/6] " Tvrtko Ursulin
2018-07-16 15:04 ` ✓ Fi.CI.IGT: success for series starting with [RESEND,1/6] " Patchwork
  -- strict thread matches above, loose matches on Subject: below --
2018-07-16  7:22 [RESEND 1/6] " Chris Wilson
2018-07-16  7:22 ` [RESEND 2/6] drm/i915/selftests: Exercise reset to break stuck GTT eviction Chris Wilson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=280388cf-4e8f-6179-4eda-410665465df2@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.