All of lore.kernel.org
 help / color / mirror / Atom feed
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Cc: igt-dev@lists.freedesktop.org
Subject: Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
Date: Fri, 3 Aug 2018 15:03:05 +0100	[thread overview]
Message-ID: <39ca04ab-888a-c9ae-dd22-44077cfa5a75@linux.intel.com> (raw)
In-Reply-To: <20180727111324.7471-1-chris@chris-wilson.co.uk>


On 27/07/2018 12:13, Chris Wilson wrote:
> We assert that we complete a wedge within 250ms. However, when we use a
> thread to delay the wedging until after we start waiting, that thread
> itself is delayed longer than our wait timeout. This results in a false
> positive error where we fail the test before we even trigger the reset.
> 
> Reorder the test so that we only ever measure the delay from triggering
> the reset until we wakeup, and assert that is in a timely fashion
> (less than 250ms).
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
>   1 file changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 3162a3170..b26b4b895 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
>   
>   struct hang_ctx {
>   	int debugfs;
> -	struct timespec ts;
> +	struct timespec delay;
> +	struct timespec *ts;
>   	timer_t timer;
>   };
>   
> @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
>   {
>   	struct hang_ctx *ctx = arg.sival_ptr;
>   
> -	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> +	igt_debug("hang delay = %.2fus\n",
> +		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
>   
> +	igt_nsec_elapsed(ctx->ts);
>   	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
>   
>   	igt_assert_eq(timer_delete(ctx->timer), 0);
> @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
>   	free(ctx);
>   }
>   
> -static void hang_after(int fd, unsigned int us)
> +static void hang_after(int fd, unsigned int us, struct timespec *ts)
>   {
>   	struct sigevent sev = {
>   		.sigev_notify = SIGEV_THREAD,
> @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
>   
>   	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
>   
> -	igt_nsec_elapsed(&ctx->ts);
> +	ctx->ts = ts;
> +	igt_nsec_elapsed(&ctx->delay);
>   
>   	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
>   }
>   
> -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
>   {
> -	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> -	int ret;
> +	struct timespec ts = {};
> +	uint64_t elapsed;
>   
>   	if (wait) {
> -		/*
> -		 * Double the wait plus some fixed margin to ensure gem_wait
> -		 * can never time out before the async hang runs.
> -		 */
> -		wait_timeout += wait * 2000 + 250e6;
> -		hang_after(fd, wait);
> +		hang_after(fd, wait, &ts);
>   	} else {
> +		igt_nsec_elapsed(&ts);
>   		manual_hang(fd);
>   	}
>   
> -	ret = __gem_wait(fd, bo, wait_timeout);
> +	gem_sync(fd, bo);

One drawback I can see is if reset fails to work then we hang until IGT 
runner timeout. Alternative would be a smaller patch to bump the wait 
delay which should work given how rare are the failures. It was one of 
the goals of the gem_eio rewrite I think, but whatever, I suppose 
runtime of test when reset is broken is not that important.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

>   
> -	return ret;
> +	elapsed = igt_nsec_elapsed(&ts);
> +	igt_assert_f(elapsed < 250e6,
> +		     "Wake up following reset+wedge took %.3fms\n",
> +		     elapsed*1e-6);
>   }
>   
>   static void __test_banned(int fd)
> @@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
>   
>   	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
>   
> -	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
> +	__check_wait(fd, hang->handle, wait);
>   
>   	igt_spin_batch_free(fd, hang);
>   
> @@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
>   	igt_set_autoresume_delay(30);
>   	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
> +	__check_wait(fd, obj[1].handle, 10);
>   
>   	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
>   		nfence++;
>   	}
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +	__check_wait(fd, obj[1].handle, wait);
>   
>   	while (nfence--) {
>   		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
> @@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
>   			gem_execbuf(fd, &execbuf);
>   
>   		/* Wedge after a small delay. */
> -		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
> +		__check_wait(fd, obj.handle, 100e3);
>   
>   		/* Unwedge by forcing a reset. */
>   		igt_assert(i915_reset_control(true));
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

WARNING: multiple messages have this Message-ID (diff)
From: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
To: Chris Wilson <chris@chris-wilson.co.uk>, intel-gfx@lists.freedesktop.org
Cc: igt-dev@lists.freedesktop.org
Subject: Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
Date: Fri, 3 Aug 2018 15:03:05 +0100	[thread overview]
Message-ID: <39ca04ab-888a-c9ae-dd22-44077cfa5a75@linux.intel.com> (raw)
In-Reply-To: <20180727111324.7471-1-chris@chris-wilson.co.uk>


On 27/07/2018 12:13, Chris Wilson wrote:
> We assert that we complete a wedge within 250ms. However, when we use a
> thread to delay the wedging until after we start waiting, that thread
> itself is delayed longer than our wait timeout. This results in a false
> positive error where we fail the test before we even trigger the reset.
> 
> Reorder the test so that we only ever measure the delay from triggering
> the reset until we wakeup, and assert that is in a timely fashion
> (less than 250ms).
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
>   1 file changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 3162a3170..b26b4b895 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
>   
>   struct hang_ctx {
>   	int debugfs;
> -	struct timespec ts;
> +	struct timespec delay;
> +	struct timespec *ts;
>   	timer_t timer;
>   };
>   
> @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
>   {
>   	struct hang_ctx *ctx = arg.sival_ptr;
>   
> -	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> +	igt_debug("hang delay = %.2fus\n",
> +		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
>   
> +	igt_nsec_elapsed(ctx->ts);
>   	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
>   
>   	igt_assert_eq(timer_delete(ctx->timer), 0);
> @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
>   	free(ctx);
>   }
>   
> -static void hang_after(int fd, unsigned int us)
> +static void hang_after(int fd, unsigned int us, struct timespec *ts)
>   {
>   	struct sigevent sev = {
>   		.sigev_notify = SIGEV_THREAD,
> @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
>   
>   	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
>   
> -	igt_nsec_elapsed(&ctx->ts);
> +	ctx->ts = ts;
> +	igt_nsec_elapsed(&ctx->delay);
>   
>   	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
>   }
>   
> -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
>   {
> -	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> -	int ret;
> +	struct timespec ts = {};
> +	uint64_t elapsed;
>   
>   	if (wait) {
> -		/*
> -		 * Double the wait plus some fixed margin to ensure gem_wait
> -		 * can never time out before the async hang runs.
> -		 */
> -		wait_timeout += wait * 2000 + 250e6;
> -		hang_after(fd, wait);
> +		hang_after(fd, wait, &ts);
>   	} else {
> +		igt_nsec_elapsed(&ts);
>   		manual_hang(fd);
>   	}
>   
> -	ret = __gem_wait(fd, bo, wait_timeout);
> +	gem_sync(fd, bo);

One drawback I can see is if reset fails to work then we hang until IGT 
runner timeout. Alternative would be a smaller patch to bump the wait 
delay which should work given how rare are the failures. It was one of 
the goals of the gem_eio rewrite I think, but whatever, I suppose 
runtime of test when reset is broken is not that important.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

>   
> -	return ret;
> +	elapsed = igt_nsec_elapsed(&ts);
> +	igt_assert_f(elapsed < 250e6,
> +		     "Wake up following reset+wedge took %.3fms\n",
> +		     elapsed*1e-6);
>   }
>   
>   static void __test_banned(int fd)
> @@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
>   
>   	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
>   
> -	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
> +	__check_wait(fd, hang->handle, wait);
>   
>   	igt_spin_batch_free(fd, hang);
>   
> @@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
>   	igt_set_autoresume_delay(30);
>   	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
> +	__check_wait(fd, obj[1].handle, 10);
>   
>   	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
>   		nfence++;
>   	}
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +	__check_wait(fd, obj[1].handle, wait);
>   
>   	while (nfence--) {
>   		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
> @@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
>   			gem_execbuf(fd, &execbuf);
>   
>   		/* Wedge after a small delay. */
> -		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
> +		__check_wait(fd, obj.handle, 100e3);
>   
>   		/* Unwedge by forcing a reset. */
>   		igt_assert(i915_reset_control(true));
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

  parent reply	other threads:[~2018-08-03 14:03 UTC|newest]

Thread overview: 13+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-07-27 11:13 [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread Chris Wilson
2018-07-27 11:13 ` [igt-dev] " Chris Wilson
2018-07-27 13:24 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
2018-07-27 14:52 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-07-27 15:16   ` Chris Wilson
2018-07-27 18:44     ` Chris Wilson
2018-07-30 16:20 ` [igt-dev] ✓ Fi.CI.BAT: success " Patchwork
2018-07-30 18:29 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
2018-07-30 18:33   ` Chris Wilson
2018-08-03 14:03 ` Tvrtko Ursulin [this message]
2018-08-03 14:03   ` [igt-dev] [PATCH i-g-t] " Tvrtko Ursulin
2018-08-03 14:10   ` Chris Wilson
2018-08-03 14:10     ` Chris Wilson

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=39ca04ab-888a-c9ae-dd22-44077cfa5a75@linux.intel.com \
    --to=tvrtko.ursulin@linux.intel.com \
    --cc=chris@chris-wilson.co.uk \
    --cc=igt-dev@lists.freedesktop.org \
    --cc=intel-gfx@lists.freedesktop.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.