All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
@ 2018-07-27 11:13 ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-07-27 11:13 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev

We assert that we complete a wedge within 250ms. However, when we use a
thread to delay the wedging until after we start waiting, that thread
itself is delayed longer than our wait timeout. This results in a false
positive error where we fail the test before we even trigger the reset.

Reorder the test so that we only ever measure the delay from triggering
the reset until we wakeup, and assert that is in a timely fashion
(less than 250ms).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/tests/gem_eio.c b/tests/gem_eio.c
index 3162a3170..b26b4b895 100644
--- a/tests/gem_eio.c
+++ b/tests/gem_eio.c
@@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
 
 struct hang_ctx {
 	int debugfs;
-	struct timespec ts;
+	struct timespec delay;
+	struct timespec *ts;
 	timer_t timer;
 };
 
@@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
 {
 	struct hang_ctx *ctx = arg.sival_ptr;
 
-	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
+	igt_debug("hang delay = %.2fus\n",
+		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
 
+	igt_nsec_elapsed(ctx->ts);
 	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
 
 	igt_assert_eq(timer_delete(ctx->timer), 0);
@@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
 	free(ctx);
 }
 
-static void hang_after(int fd, unsigned int us)
+static void hang_after(int fd, unsigned int us, struct timespec *ts)
 {
 	struct sigevent sev = {
 		.sigev_notify = SIGEV_THREAD,
@@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
 
 	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
 
-	igt_nsec_elapsed(&ctx->ts);
+	ctx->ts = ts;
+	igt_nsec_elapsed(&ctx->delay);
 
 	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
 }
 
-static int __check_wait(int fd, uint32_t bo, unsigned int wait)
+static void __check_wait(int fd, uint32_t bo, unsigned int wait)
 {
-	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
-	int ret;
+	struct timespec ts = {};
+	uint64_t elapsed;
 
 	if (wait) {
-		/*
-		 * Double the wait plus some fixed margin to ensure gem_wait
-		 * can never time out before the async hang runs.
-		 */
-		wait_timeout += wait * 2000 + 250e6;
-		hang_after(fd, wait);
+		hang_after(fd, wait, &ts);
 	} else {
+		igt_nsec_elapsed(&ts);
 		manual_hang(fd);
 	}
 
-	ret = __gem_wait(fd, bo, wait_timeout);
+	gem_sync(fd, bo);
 
-	return ret;
+	elapsed = igt_nsec_elapsed(&ts);
+	igt_assert_f(elapsed < 250e6,
+		     "Wake up following reset+wedge took %.3fms\n",
+		     elapsed*1e-6);
 }
 
 static void __test_banned(int fd)
@@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
 
 	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
 
-	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
+	__check_wait(fd, hang->handle, wait);
 
 	igt_spin_batch_free(fd, hang);
 
@@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
 			igt_assert(fence[n] != -1);
 		}
 
-		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+		__check_wait(fd, obj[1].handle, wait);
 
 		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
 	igt_set_autoresume_delay(30);
 	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
 
-	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
+	__check_wait(fd, obj[1].handle, 10);
 
 	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
 			igt_assert(fence[n] != -1);
 		}
 
-		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+		__check_wait(fd, obj[1].handle, wait);
 
 		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
 		nfence++;
 	}
 
-	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+	__check_wait(fd, obj[1].handle, wait);
 
 	while (nfence--) {
 		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
@@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
 			gem_execbuf(fd, &execbuf);
 
 		/* Wedge after a small delay. */
-		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
+		__check_wait(fd, obj.handle, 100e3);
 
 		/* Unwedge by forcing a reset. */
 		igt_assert(i915_reset_control(true));
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
@ 2018-07-27 11:13 ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-07-27 11:13 UTC (permalink / raw)
  To: intel-gfx; +Cc: igt-dev

We assert that we complete a wedge within 250ms. However, when we use a
thread to delay the wedging until after we start waiting, that thread
itself is delayed longer than our wait timeout. This results in a false
positive error where we fail the test before we even trigger the reset.

Reorder the test so that we only ever measure the delay from triggering
the reset until we wakeup, and assert that is in a timely fashion
(less than 250ms).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 21 deletions(-)

diff --git a/tests/gem_eio.c b/tests/gem_eio.c
index 3162a3170..b26b4b895 100644
--- a/tests/gem_eio.c
+++ b/tests/gem_eio.c
@@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
 
 struct hang_ctx {
 	int debugfs;
-	struct timespec ts;
+	struct timespec delay;
+	struct timespec *ts;
 	timer_t timer;
 };
 
@@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
 {
 	struct hang_ctx *ctx = arg.sival_ptr;
 
-	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
+	igt_debug("hang delay = %.2fus\n",
+		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
 
+	igt_nsec_elapsed(ctx->ts);
 	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
 
 	igt_assert_eq(timer_delete(ctx->timer), 0);
@@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
 	free(ctx);
 }
 
-static void hang_after(int fd, unsigned int us)
+static void hang_after(int fd, unsigned int us, struct timespec *ts)
 {
 	struct sigevent sev = {
 		.sigev_notify = SIGEV_THREAD,
@@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
 
 	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
 
-	igt_nsec_elapsed(&ctx->ts);
+	ctx->ts = ts;
+	igt_nsec_elapsed(&ctx->delay);
 
 	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
 }
 
-static int __check_wait(int fd, uint32_t bo, unsigned int wait)
+static void __check_wait(int fd, uint32_t bo, unsigned int wait)
 {
-	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
-	int ret;
+	struct timespec ts = {};
+	uint64_t elapsed;
 
 	if (wait) {
-		/*
-		 * Double the wait plus some fixed margin to ensure gem_wait
-		 * can never time out before the async hang runs.
-		 */
-		wait_timeout += wait * 2000 + 250e6;
-		hang_after(fd, wait);
+		hang_after(fd, wait, &ts);
 	} else {
+		igt_nsec_elapsed(&ts);
 		manual_hang(fd);
 	}
 
-	ret = __gem_wait(fd, bo, wait_timeout);
+	gem_sync(fd, bo);
 
-	return ret;
+	elapsed = igt_nsec_elapsed(&ts);
+	igt_assert_f(elapsed < 250e6,
+		     "Wake up following reset+wedge took %.3fms\n",
+		     elapsed*1e-6);
 }
 
 static void __test_banned(int fd)
@@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
 
 	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
 
-	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
+	__check_wait(fd, hang->handle, wait);
 
 	igt_spin_batch_free(fd, hang);
 
@@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
 			igt_assert(fence[n] != -1);
 		}
 
-		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+		__check_wait(fd, obj[1].handle, wait);
 
 		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
 	igt_set_autoresume_delay(30);
 	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
 
-	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
+	__check_wait(fd, obj[1].handle, 10);
 
 	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
 			igt_assert(fence[n] != -1);
 		}
 
-		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+		__check_wait(fd, obj[1].handle, wait);
 
 		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
 			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
@@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
 		nfence++;
 	}
 
-	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
+	__check_wait(fd, obj[1].handle, wait);
 
 	while (nfence--) {
 		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
@@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
 			gem_execbuf(fd, &execbuf);
 
 		/* Wedge after a small delay. */
-		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
+		__check_wait(fd, obj.handle, 100e3);
 
 		/* Unwedge by forcing a reset. */
 		igt_assert(i915_reset_control(true));
-- 
2.18.0

_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for igt/gem_eio: Measure reset delay from thread
  2018-07-27 11:13 ` [igt-dev] " Chris Wilson
  (?)
@ 2018-07-27 13:24 ` Patchwork
  -1 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-07-27 13:24 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: igt/gem_eio: Measure reset delay from thread
URL   : https://patchwork.freedesktop.org/series/47338/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4560 -> IGTPW_1662 =

== Summary - SUCCESS ==

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/47338/revisions/1/mbox/

== Known issues ==

  Here are the changes found in IGTPW_1662 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@kms_frontbuffer_tracking@basic:
      fi-kbl-r:           PASS -> DMESG-WARN (fdo#105602)

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
      fi-snb-2520m:       PASS -> INCOMPLETE (fdo#103713)

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c:
      fi-bxt-dsi:         PASS -> INCOMPLETE (fdo#103927)

    
    ==== Possible fixes ====

    igt@kms_chamelium@dp-crc-fast:
      fi-kbl-7500u:       DMESG-FAIL (fdo#103841) -> PASS

    {igt@kms_psr@primary_mmap_gtt}:
      fi-cnl-psr:         DMESG-WARN (fdo#107372) -> PASS

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
  fdo#103841 https://bugs.freedesktop.org/show_bug.cgi?id=103841
  fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
  fdo#105602 https://bugs.freedesktop.org/show_bug.cgi?id=105602
  fdo#107372 https://bugs.freedesktop.org/show_bug.cgi?id=107372


== Participating hosts (51 -> 46) ==

  Additional (1): fi-skl-guc 
  Missing    (6): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-icl-u fi-byt-clapper 


== Build changes ==

    * IGT: IGT_4576 -> IGTPW_1662

  CI_DRM_4560: b73c0ddef408783e556741ac9d3679b7d153e3e1 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_1662: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1662/
  IGT_4576: bcb37a9b20eeec97f15fac2222408cc2e0b77631 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1662/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✗ Fi.CI.IGT: failure for igt/gem_eio: Measure reset delay from thread
  2018-07-27 11:13 ` [igt-dev] " Chris Wilson
  (?)
  (?)
@ 2018-07-27 14:52 ` Patchwork
  2018-07-27 15:16   ` Chris Wilson
  -1 siblings, 1 reply; 13+ messages in thread
From: Patchwork @ 2018-07-27 14:52 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: igt/gem_eio: Measure reset delay from thread
URL   : https://patchwork.freedesktop.org/series/47338/
State : failure

== Summary ==

= CI Bug Log - changes from IGT_4576_full -> IGTPW_1662_full =

== Summary - FAILURE ==

  Serious unknown changes coming with IGTPW_1662_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in IGTPW_1662_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/47338/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in IGTPW_1662_full:

  === IGT changes ===

    ==== Possible regressions ====

    igt@gem_eio@reset-stress:
      shard-hsw:          PASS -> FAIL +2

    igt@gem_eio@wait-10ms:
      shard-snb:          PASS -> FAIL

    
    ==== Warnings ====

    igt@perf_pmu@rc6:
      shard-kbl:          PASS -> SKIP

    
== Known issues ==

  Here are the changes found in IGTPW_1662_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@kms_cursor_crc@cursor-256x256-suspend:
      shard-apl:          PASS -> FAIL (fdo#103375)

    igt@kms_cursor_crc@cursor-64x64-suspend:
      shard-snb:          PASS -> INCOMPLETE (fdo#105411) +1

    igt@pm_rpm@modeset-non-lpsp-stress:
      shard-kbl:          PASS -> FAIL (fdo#106539)
      shard-hsw:          PASS -> FAIL (fdo#106539)
      shard-glk:          PASS -> FAIL (fdo#106539)
      shard-apl:          PASS -> FAIL (fdo#106539)

    
    ==== Possible fixes ====

    igt@gem_softpin@evict-active-interruptible:
      shard-snb:          INCOMPLETE (fdo#105411) -> PASS

    igt@kms_draw_crc@draw-method-xrgb8888-pwrite-ytiled:
      shard-glk:          FAIL (fdo#107401) -> PASS

    igt@kms_flip@2x-plain-flip-fb-recreate:
      shard-glk:          FAIL (fdo#100368) -> PASS

    igt@kms_plane@plane-panning-bottom-right-suspend-pipe-c-planes:
      shard-kbl:          INCOMPLETE (fdo#103665) -> PASS

    igt@kms_setmode@basic:
      shard-apl:          FAIL (fdo#99912) -> PASS

    igt@kms_vblank@pipe-b-ts-continuation-dpms-suspend:
      shard-glk:          FAIL (fdo#103375) -> PASS

    igt@prime_vgem@basic-fence-flip:
      shard-kbl:          DMESG-WARN (fdo#106247) -> PASS

    
  fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
  fdo#103375 https://bugs.freedesktop.org/show_bug.cgi?id=103375
  fdo#103665 https://bugs.freedesktop.org/show_bug.cgi?id=103665
  fdo#105411 https://bugs.freedesktop.org/show_bug.cgi?id=105411
  fdo#106247 https://bugs.freedesktop.org/show_bug.cgi?id=106247
  fdo#106539 https://bugs.freedesktop.org/show_bug.cgi?id=106539
  fdo#107401 https://bugs.freedesktop.org/show_bug.cgi?id=107401
  fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912


== Participating hosts (5 -> 5) ==

  No changes in participating hosts


== Build changes ==

    * IGT: IGT_4576 -> IGTPW_1662
    * Linux: CI_DRM_4547 -> CI_DRM_4560

  CI_DRM_4547: 0a7ab192a697e951b2404f3c1ce42c5fa74f9ed1 @ git://anongit.freedesktop.org/gfx-ci/linux
  CI_DRM_4560: b73c0ddef408783e556741ac9d3679b7d153e3e1 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_1662: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1662/
  IGT_4576: bcb37a9b20eeec97f15fac2222408cc2e0b77631 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1662/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] ✗ Fi.CI.IGT: failure for igt/gem_eio: Measure reset delay from thread
  2018-07-27 14:52 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
@ 2018-07-27 15:16   ` Chris Wilson
  2018-07-27 18:44     ` Chris Wilson
  0 siblings, 1 reply; 13+ messages in thread
From: Chris Wilson @ 2018-07-27 15:16 UTC (permalink / raw)
  To: Patchwork, igt-dev

Quoting Patchwork (2018-07-27 15:52:37)
> == Series Details ==
> 
> Series: igt/gem_eio: Measure reset delay from thread
> URL   : https://patchwork.freedesktop.org/series/47338/
> State : failure
> 
> == Summary ==
> 
> = CI Bug Log - changes from IGT_4576_full -> IGTPW_1662_full =
> 
> == Summary - FAILURE ==
> 
>   Serious unknown changes coming with IGTPW_1662_full absolutely need to be
>   verified manually.
>   
>   If you think the reported changes have nothing to do with the changes
>   introduced in IGTPW_1662_full, please notify your bug team to allow them
>   to document this new failure mode, which will reduce false positives in CI.
> 
>   External URL: https://patchwork.freedesktop.org/api/1.0/series/47338/revisions/1/mbox/
> 
> == Possible new issues ==
> 
>   Here are the unknown changes that may have been introduced in IGTPW_1662_full:
> 
>   === IGT changes ===
> 
>     ==== Possible regressions ====
> 
>     igt@gem_eio@reset-stress:
>       shard-hsw:          PASS -> FAIL +2
> 
>     igt@gem_eio@wait-10ms:
>       shard-snb:          PASS -> FAIL
> 
>     
>     ==== Warnings ====
> 
>     igt@perf_pmu@rc6:
>       shard-kbl:          PASS -> SKIP
> 
>     
> == Known issues ==
> 
>   Here are the changes found in IGTPW_1662_full that come from known issues:
> 
>   === IGT changes ===
> 
>     ==== Issues hit ====
> 
>     igt@kms_cursor_crc@cursor-256x256-suspend:
>       shard-apl:          PASS -> FAIL (fdo#103375)
> 
>     igt@kms_cursor_crc@cursor-64x64-suspend:
>       shard-snb:          PASS -> INCOMPLETE (fdo#105411) +1
> 
>     igt@pm_rpm@modeset-non-lpsp-stress:
>       shard-kbl:          PASS -> FAIL (fdo#106539)
>       shard-hsw:          PASS -> FAIL (fdo#106539)
>       shard-glk:          PASS -> FAIL (fdo#106539)
>       shard-apl:          PASS -> FAIL (fdo#106539)

Hmm, eye opening. We've found a genuine bug, missing the wakeup.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] ✗ Fi.CI.IGT: failure for igt/gem_eio: Measure reset delay from thread
  2018-07-27 15:16   ` Chris Wilson
@ 2018-07-27 18:44     ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-07-27 18:44 UTC (permalink / raw)
  To: Patchwork, igt-dev

Quoting Chris Wilson (2018-07-27 16:16:32)
> > == Known issues ==
> > 
> >   Here are the changes found in IGTPW_1662_full that come from known issues:
> > 
> >   === IGT changes ===
> > 
> >     ==== Issues hit ====
> > 
> >     igt@kms_cursor_crc@cursor-256x256-suspend:
> >       shard-apl:          PASS -> FAIL (fdo#103375)
> > 
> >     igt@kms_cursor_crc@cursor-64x64-suspend:
> >       shard-snb:          PASS -> INCOMPLETE (fdo#105411) +1
> > 
> >     igt@pm_rpm@modeset-non-lpsp-stress:
> >       shard-kbl:          PASS -> FAIL (fdo#106539)
> >       shard-hsw:          PASS -> FAIL (fdo#106539)
> >       shard-glk:          PASS -> FAIL (fdo#106539)
> >       shard-apl:          PASS -> FAIL (fdo#106539)
> 
> Hmm, eye opening. We've found a genuine bug, missing the wakeup.

Band aid sent.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.BAT: success for igt/gem_eio: Measure reset delay from thread
  2018-07-27 11:13 ` [igt-dev] " Chris Wilson
                   ` (2 preceding siblings ...)
  (?)
@ 2018-07-30 16:20 ` Patchwork
  -1 siblings, 0 replies; 13+ messages in thread
From: Patchwork @ 2018-07-30 16:20 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: igt/gem_eio: Measure reset delay from thread
URL   : https://patchwork.freedesktop.org/series/47338/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4592 -> IGTPW_1671 =

== Summary - SUCCESS ==

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/47338/revisions/1/mbox/

== Known issues ==

  Here are the changes found in IGTPW_1671 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@drv_module_reload@basic-reload-inject:
      fi-hsw-4770r:       PASS -> DMESG-WARN (fdo#107425)

    igt@kms_flip@basic-flip-vs-dpms:
      fi-skl-6700hq:      PASS -> DMESG-WARN (fdo#105998)

    igt@prime_vgem@basic-fence-flip:
      fi-ilk-650:         PASS -> FAIL (fdo#104008)

    
    ==== Warnings ====

    {igt@kms_psr@primary_page_flip}:
      fi-cnl-psr:         DMESG-WARN (fdo#107372) -> DMESG-FAIL (fdo#107372)

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#104008 https://bugs.freedesktop.org/show_bug.cgi?id=104008
  fdo#105998 https://bugs.freedesktop.org/show_bug.cgi?id=105998
  fdo#107372 https://bugs.freedesktop.org/show_bug.cgi?id=107372
  fdo#107425 https://bugs.freedesktop.org/show_bug.cgi?id=107425


== Participating hosts (52 -> 47) ==

  Additional (1): fi-kbl-8809g 
  Missing    (6): fi-ilk-m540 fi-hsw-4200u fi-byt-squawks fi-bsw-cyan fi-ctg-p8600 fi-byt-clapper 


== Build changes ==

    * IGT: IGT_4580 -> IGTPW_1671

  CI_DRM_4592: ed4b6c987889caf5a4041cf78eacdede85964577 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_1671: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1671/
  IGT_4580: f1c868dae24056ebc27e4f3c197724ce9b956a8a @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1671/issues.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* [igt-dev] ✓ Fi.CI.IGT: success for igt/gem_eio: Measure reset delay from thread
  2018-07-27 11:13 ` [igt-dev] " Chris Wilson
                   ` (3 preceding siblings ...)
  (?)
@ 2018-07-30 18:29 ` Patchwork
  2018-07-30 18:33   ` Chris Wilson
  -1 siblings, 1 reply; 13+ messages in thread
From: Patchwork @ 2018-07-30 18:29 UTC (permalink / raw)
  To: Chris Wilson; +Cc: igt-dev

== Series Details ==

Series: igt/gem_eio: Measure reset delay from thread
URL   : https://patchwork.freedesktop.org/series/47338/
State : success

== Summary ==

= CI Bug Log - changes from IGT_4580_full -> IGTPW_1671_full =

== Summary - WARNING ==

  Minor unknown changes coming with IGTPW_1671_full need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in IGTPW_1671_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/47338/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in IGTPW_1671_full:

  === IGT changes ===

    ==== Warnings ====

    igt@kms_vblank@pipe-b-accuracy-idle:
      shard-snb:          PASS -> SKIP +1

    igt@pm_rc6_residency@rc6-accuracy:
      shard-kbl:          PASS -> SKIP
      shard-snb:          SKIP -> PASS

    
== Known issues ==

  Here are the changes found in IGTPW_1671_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@gem_eio@wait-immediate:
      shard-snb:          PASS -> INCOMPLETE (fdo#105411)

    igt@gem_ppgtt@blt-vs-render-ctx0:
      shard-kbl:          PASS -> INCOMPLETE (fdo#103665, fdo#106023)

    igt@kms_frontbuffer_tracking@fbc-1p-offscren-pri-indfb-draw-blt:
      shard-glk:          PASS -> FAIL (fdo#103167)

    igt@kms_setmode@basic:
      shard-apl:          PASS -> FAIL (fdo#99912)

    igt@kms_vblank@pipe-c-ts-continuation-suspend:
      shard-kbl:          PASS -> INCOMPLETE (fdo#103665)

    igt@testdisplay:
      shard-glk:          PASS -> INCOMPLETE (k.org#198133, fdo#107093, fdo#103359)

    
    ==== Possible fixes ====

    igt@kms_flip@plain-flip-fb-recreate-interruptible:
      shard-glk:          FAIL (fdo#100368) -> PASS

    igt@kms_plane_multiple@atomic-pipe-a-tiling-x:
      shard-snb:          FAIL (fdo#103166) -> PASS

    igt@kms_rotation_crc@sprite-rotation-180:
      shard-snb:          FAIL (fdo#103925) -> PASS +1

    
  fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
  fdo#103166 https://bugs.freedesktop.org/show_bug.cgi?id=103166
  fdo#103167 https://bugs.freedesktop.org/show_bug.cgi?id=103167
  fdo#103359 https://bugs.freedesktop.org/show_bug.cgi?id=103359
  fdo#103665 https://bugs.freedesktop.org/show_bug.cgi?id=103665
  fdo#103925 https://bugs.freedesktop.org/show_bug.cgi?id=103925
  fdo#105411 https://bugs.freedesktop.org/show_bug.cgi?id=105411
  fdo#106023 https://bugs.freedesktop.org/show_bug.cgi?id=106023
  fdo#107093 https://bugs.freedesktop.org/show_bug.cgi?id=107093
  fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912
  k.org#198133 https://bugzilla.kernel.org/show_bug.cgi?id=198133


== Participating hosts (5 -> 5) ==

  No changes in participating hosts


== Build changes ==

    * IGT: IGT_4580 -> IGTPW_1671
    * Linux: CI_DRM_4562 -> CI_DRM_4592

  CI_DRM_4562: 99bbd80f75cdcf28699ffd3c93a714ca4a89b962 @ git://anongit.freedesktop.org/gfx-ci/linux
  CI_DRM_4592: ed4b6c987889caf5a4041cf78eacdede85964577 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGTPW_1671: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1671/
  IGT_4580: f1c868dae24056ebc27e4f3c197724ce9b956a8a @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/IGTPW_1671/shards.html
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] ✓ Fi.CI.IGT: success for igt/gem_eio: Measure reset delay from thread
  2018-07-30 18:29 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
@ 2018-07-30 18:33   ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-07-30 18:33 UTC (permalink / raw)
  To: Patchwork, igt-dev

Quoting Patchwork (2018-07-30 19:29:16)
>     igt@gem_eio@wait-immediate:
>       shard-snb:          PASS -> INCOMPLETE (fdo#105411)
> 

No logs, not even on the paths affected by the patch.

Please review kindly.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
  2018-07-27 11:13 ` [igt-dev] " Chris Wilson
@ 2018-08-03 14:03   ` Tvrtko Ursulin
  -1 siblings, 0 replies; 13+ messages in thread
From: Tvrtko Ursulin @ 2018-08-03 14:03 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: igt-dev


On 27/07/2018 12:13, Chris Wilson wrote:
> We assert that we complete a wedge within 250ms. However, when we use a
> thread to delay the wedging until after we start waiting, that thread
> itself is delayed longer than our wait timeout. This results in a false
> positive error where we fail the test before we even trigger the reset.
> 
> Reorder the test so that we only ever measure the delay from triggering
> the reset until we wakeup, and assert that is in a timely fashion
> (less than 250ms).
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
>   1 file changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 3162a3170..b26b4b895 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
>   
>   struct hang_ctx {
>   	int debugfs;
> -	struct timespec ts;
> +	struct timespec delay;
> +	struct timespec *ts;
>   	timer_t timer;
>   };
>   
> @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
>   {
>   	struct hang_ctx *ctx = arg.sival_ptr;
>   
> -	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> +	igt_debug("hang delay = %.2fus\n",
> +		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
>   
> +	igt_nsec_elapsed(ctx->ts);
>   	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
>   
>   	igt_assert_eq(timer_delete(ctx->timer), 0);
> @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
>   	free(ctx);
>   }
>   
> -static void hang_after(int fd, unsigned int us)
> +static void hang_after(int fd, unsigned int us, struct timespec *ts)
>   {
>   	struct sigevent sev = {
>   		.sigev_notify = SIGEV_THREAD,
> @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
>   
>   	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
>   
> -	igt_nsec_elapsed(&ctx->ts);
> +	ctx->ts = ts;
> +	igt_nsec_elapsed(&ctx->delay);
>   
>   	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
>   }
>   
> -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
>   {
> -	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> -	int ret;
> +	struct timespec ts = {};
> +	uint64_t elapsed;
>   
>   	if (wait) {
> -		/*
> -		 * Double the wait plus some fixed margin to ensure gem_wait
> -		 * can never time out before the async hang runs.
> -		 */
> -		wait_timeout += wait * 2000 + 250e6;
> -		hang_after(fd, wait);
> +		hang_after(fd, wait, &ts);
>   	} else {
> +		igt_nsec_elapsed(&ts);
>   		manual_hang(fd);
>   	}
>   
> -	ret = __gem_wait(fd, bo, wait_timeout);
> +	gem_sync(fd, bo);

One drawback I can see is if reset fails to work then we hang until IGT 
runner timeout. Alternative would be a smaller patch to bump the wait 
delay which should work given how rare are the failures. It was one of 
the goals of the gem_eio rewrite I think, but whatever, I suppose 
runtime of test when reset is broken is not that important.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

>   
> -	return ret;
> +	elapsed = igt_nsec_elapsed(&ts);
> +	igt_assert_f(elapsed < 250e6,
> +		     "Wake up following reset+wedge took %.3fms\n",
> +		     elapsed*1e-6);
>   }
>   
>   static void __test_banned(int fd)
> @@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
>   
>   	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
>   
> -	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
> +	__check_wait(fd, hang->handle, wait);
>   
>   	igt_spin_batch_free(fd, hang);
>   
> @@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
>   	igt_set_autoresume_delay(30);
>   	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
> +	__check_wait(fd, obj[1].handle, 10);
>   
>   	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
>   		nfence++;
>   	}
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +	__check_wait(fd, obj[1].handle, wait);
>   
>   	while (nfence--) {
>   		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
> @@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
>   			gem_execbuf(fd, &execbuf);
>   
>   		/* Wedge after a small delay. */
> -		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
> +		__check_wait(fd, obj.handle, 100e3);
>   
>   		/* Unwedge by forcing a reset. */
>   		igt_assert(i915_reset_control(true));
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
@ 2018-08-03 14:03   ` Tvrtko Ursulin
  0 siblings, 0 replies; 13+ messages in thread
From: Tvrtko Ursulin @ 2018-08-03 14:03 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: igt-dev


On 27/07/2018 12:13, Chris Wilson wrote:
> We assert that we complete a wedge within 250ms. However, when we use a
> thread to delay the wedging until after we start waiting, that thread
> itself is delayed longer than our wait timeout. This results in a false
> positive error where we fail the test before we even trigger the reset.
> 
> Reorder the test so that we only ever measure the delay from triggering
> the reset until we wakeup, and assert that is in a timely fashion
> (less than 250ms).
> 
> Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
>   1 file changed, 24 insertions(+), 21 deletions(-)
> 
> diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> index 3162a3170..b26b4b895 100644
> --- a/tests/gem_eio.c
> +++ b/tests/gem_eio.c
> @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
>   
>   struct hang_ctx {
>   	int debugfs;
> -	struct timespec ts;
> +	struct timespec delay;
> +	struct timespec *ts;
>   	timer_t timer;
>   };
>   
> @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
>   {
>   	struct hang_ctx *ctx = arg.sival_ptr;
>   
> -	igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> +	igt_debug("hang delay = %.2fus\n",
> +		  igt_nsec_elapsed(&ctx->delay) / 1000.0);
>   
> +	igt_nsec_elapsed(ctx->ts);
>   	igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
>   
>   	igt_assert_eq(timer_delete(ctx->timer), 0);
> @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
>   	free(ctx);
>   }
>   
> -static void hang_after(int fd, unsigned int us)
> +static void hang_after(int fd, unsigned int us, struct timespec *ts)
>   {
>   	struct sigevent sev = {
>   		.sigev_notify = SIGEV_THREAD,
> @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
>   
>   	igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
>   
> -	igt_nsec_elapsed(&ctx->ts);
> +	ctx->ts = ts;
> +	igt_nsec_elapsed(&ctx->delay);
>   
>   	igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
>   }
>   
> -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
>   {
> -	unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> -	int ret;
> +	struct timespec ts = {};
> +	uint64_t elapsed;
>   
>   	if (wait) {
> -		/*
> -		 * Double the wait plus some fixed margin to ensure gem_wait
> -		 * can never time out before the async hang runs.
> -		 */
> -		wait_timeout += wait * 2000 + 250e6;
> -		hang_after(fd, wait);
> +		hang_after(fd, wait, &ts);
>   	} else {
> +		igt_nsec_elapsed(&ts);
>   		manual_hang(fd);
>   	}
>   
> -	ret = __gem_wait(fd, bo, wait_timeout);
> +	gem_sync(fd, bo);

One drawback I can see is if reset fails to work then we hang until IGT 
runner timeout. Alternative would be a smaller patch to bump the wait 
delay which should work given how rare are the failures. It was one of 
the goals of the gem_eio rewrite I think, but whatever, I suppose 
runtime of test when reset is broken is not that important.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

>   
> -	return ret;
> +	elapsed = igt_nsec_elapsed(&ts);
> +	igt_assert_f(elapsed < 250e6,
> +		     "Wake up following reset+wedge took %.3fms\n",
> +		     elapsed*1e-6);
>   }
>   
>   static void __test_banned(int fd)
> @@ -322,7 +325,7 @@ static void test_wait(int fd, unsigned int flags, unsigned int wait)
>   
>   	hang = spin_sync(fd, 0, I915_EXEC_DEFAULT);
>   
> -	igt_assert_eq(__check_wait(fd, hang->handle, wait), 0);
> +	__check_wait(fd, hang->handle, wait);
>   
>   	igt_spin_batch_free(fd, hang);
>   
> @@ -392,7 +395,7 @@ static void test_inflight(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -443,7 +446,7 @@ static void test_inflight_suspend(int fd)
>   	igt_set_autoresume_delay(30);
>   	igt_system_suspend_autoresume(SUSPEND_STATE_MEM, SUSPEND_TEST_NONE);
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, 10), 0);
> +	__check_wait(fd, obj[1].handle, 10);
>   
>   	for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   		igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -521,7 +524,7 @@ static void test_inflight_contexts(int fd, unsigned int wait)
>   			igt_assert(fence[n] != -1);
>   		}
>   
> -		igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +		__check_wait(fd, obj[1].handle, wait);
>   
>   		for (unsigned int n = 0; n < ARRAY_SIZE(fence); n++) {
>   			igt_assert_eq(sync_fence_status(fence[n]), -EIO);
> @@ -630,7 +633,7 @@ static void test_inflight_internal(int fd, unsigned int wait)
>   		nfence++;
>   	}
>   
> -	igt_assert_eq(__check_wait(fd, obj[1].handle, wait), 0);
> +	__check_wait(fd, obj[1].handle, wait);
>   
>   	while (nfence--) {
>   		igt_assert_eq(sync_fence_status(fences[nfence]), -EIO);
> @@ -682,7 +685,7 @@ static void test_reset_stress(int fd, unsigned int flags)
>   			gem_execbuf(fd, &execbuf);
>   
>   		/* Wedge after a small delay. */
> -		igt_assert_eq(__check_wait(fd, obj.handle, 100e3), 0);
> +		__check_wait(fd, obj.handle, 100e3);
>   
>   		/* Unwedge by forcing a reset. */
>   		igt_assert(i915_reset_control(true));
> 
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
  2018-08-03 14:03   ` Tvrtko Ursulin
@ 2018-08-03 14:10     ` Chris Wilson
  -1 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-08-03 14:10 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: igt-dev

Quoting Tvrtko Ursulin (2018-08-03 15:03:05)
> 
> On 27/07/2018 12:13, Chris Wilson wrote:
> > We assert that we complete a wedge within 250ms. However, when we use a
> > thread to delay the wedging until after we start waiting, that thread
> > itself is delayed longer than our wait timeout. This results in a false
> > positive error where we fail the test before we even trigger the reset.
> > 
> > Reorder the test so that we only ever measure the delay from triggering
> > the reset until we wakeup, and assert that is in a timely fashion
> > (less than 250ms).
> > 
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
> >   1 file changed, 24 insertions(+), 21 deletions(-)
> > 
> > diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> > index 3162a3170..b26b4b895 100644
> > --- a/tests/gem_eio.c
> > +++ b/tests/gem_eio.c
> > @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
> >   
> >   struct hang_ctx {
> >       int debugfs;
> > -     struct timespec ts;
> > +     struct timespec delay;
> > +     struct timespec *ts;
> >       timer_t timer;
> >   };
> >   
> > @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
> >   {
> >       struct hang_ctx *ctx = arg.sival_ptr;
> >   
> > -     igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> > +     igt_debug("hang delay = %.2fus\n",
> > +               igt_nsec_elapsed(&ctx->delay) / 1000.0);
> >   
> > +     igt_nsec_elapsed(ctx->ts);
> >       igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
> >   
> >       igt_assert_eq(timer_delete(ctx->timer), 0);
> > @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
> >       free(ctx);
> >   }
> >   
> > -static void hang_after(int fd, unsigned int us)
> > +static void hang_after(int fd, unsigned int us, struct timespec *ts)
> >   {
> >       struct sigevent sev = {
> >               .sigev_notify = SIGEV_THREAD,
> > @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
> >   
> >       igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
> >   
> > -     igt_nsec_elapsed(&ctx->ts);
> > +     ctx->ts = ts;
> > +     igt_nsec_elapsed(&ctx->delay);
> >   
> >       igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
> >   }
> >   
> > -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> > +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
> >   {
> > -     unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> > -     int ret;
> > +     struct timespec ts = {};
> > +     uint64_t elapsed;
> >   
> >       if (wait) {
> > -             /*
> > -              * Double the wait plus some fixed margin to ensure gem_wait
> > -              * can never time out before the async hang runs.
> > -              */
> > -             wait_timeout += wait * 2000 + 250e6;
> > -             hang_after(fd, wait);
> > +             hang_after(fd, wait, &ts);
> >       } else {
> > +             igt_nsec_elapsed(&ts);
> >               manual_hang(fd);
> >       }
> >   
> > -     ret = __gem_wait(fd, bo, wait_timeout);
> > +     gem_sync(fd, bo);
> 
> One drawback I can see is if reset fails to work then we hang until IGT 
> runner timeout. Alternative would be a smaller patch to bump the wait 
> delay which should work given how rare are the failures. It was one of 
> the goals of the gem_eio rewrite I think, but whatever, I suppose 
> runtime of test when reset is broken is not that important.

It hard to put an upper bound on just how much the scheduler is going to
screw us over. On the other hand, this let's us say precisely which
portion must be done in the timely fashion (reset -> wakeup), and for
that bound can be invariant. Which I think is nice.

The downside is indeed that instead of getting a clear fail with
explanation if the reset fails completely (not even able to wedge the
device), and that hangcheck itself fails to wedge eventually, then we do
rely on the external runner to declare the test broken. I think that's
fair, it requires a series of events to fail; and in particular
hangcheck is supposed to be guarding against even its own failure.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [igt-dev] [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread
@ 2018-08-03 14:10     ` Chris Wilson
  0 siblings, 0 replies; 13+ messages in thread
From: Chris Wilson @ 2018-08-03 14:10 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: igt-dev

Quoting Tvrtko Ursulin (2018-08-03 15:03:05)
> 
> On 27/07/2018 12:13, Chris Wilson wrote:
> > We assert that we complete a wedge within 250ms. However, when we use a
> > thread to delay the wedging until after we start waiting, that thread
> > itself is delayed longer than our wait timeout. This results in a false
> > positive error where we fail the test before we even trigger the reset.
> > 
> > Reorder the test so that we only ever measure the delay from triggering
> > the reset until we wakeup, and assert that is in a timely fashion
> > (less than 250ms).
> > 
> > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=105954
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   tests/gem_eio.c | 45 ++++++++++++++++++++++++---------------------
> >   1 file changed, 24 insertions(+), 21 deletions(-)
> > 
> > diff --git a/tests/gem_eio.c b/tests/gem_eio.c
> > index 3162a3170..b26b4b895 100644
> > --- a/tests/gem_eio.c
> > +++ b/tests/gem_eio.c
> > @@ -190,7 +190,8 @@ static igt_spin_t * spin_sync(int fd, uint32_t ctx, unsigned long flags)
> >   
> >   struct hang_ctx {
> >       int debugfs;
> > -     struct timespec ts;
> > +     struct timespec delay;
> > +     struct timespec *ts;
> >       timer_t timer;
> >   };
> >   
> > @@ -198,8 +199,10 @@ static void hang_handler(union sigval arg)
> >   {
> >       struct hang_ctx *ctx = arg.sival_ptr;
> >   
> > -     igt_debug("hang delay = %.2fus\n", igt_nsec_elapsed(&ctx->ts) / 1000.0);
> > +     igt_debug("hang delay = %.2fus\n",
> > +               igt_nsec_elapsed(&ctx->delay) / 1000.0);
> >   
> > +     igt_nsec_elapsed(ctx->ts);
> >       igt_assert(igt_sysfs_set(ctx->debugfs, "i915_wedged", "-1"));
> >   
> >       igt_assert_eq(timer_delete(ctx->timer), 0);
> > @@ -207,7 +210,7 @@ static void hang_handler(union sigval arg)
> >       free(ctx);
> >   }
> >   
> > -static void hang_after(int fd, unsigned int us)
> > +static void hang_after(int fd, unsigned int us, struct timespec *ts)
> >   {
> >       struct sigevent sev = {
> >               .sigev_notify = SIGEV_THREAD,
> > @@ -229,30 +232,30 @@ static void hang_after(int fd, unsigned int us)
> >   
> >       igt_assert_eq(timer_create(CLOCK_MONOTONIC, &sev, &ctx->timer), 0);
> >   
> > -     igt_nsec_elapsed(&ctx->ts);
> > +     ctx->ts = ts;
> > +     igt_nsec_elapsed(&ctx->delay);
> >   
> >       igt_assert_eq(timer_settime(ctx->timer, 0, &its, NULL), 0);
> >   }
> >   
> > -static int __check_wait(int fd, uint32_t bo, unsigned int wait)
> > +static void __check_wait(int fd, uint32_t bo, unsigned int wait)
> >   {
> > -     unsigned long wait_timeout = 250e6; /* Some margin for actual reset. */
> > -     int ret;
> > +     struct timespec ts = {};
> > +     uint64_t elapsed;
> >   
> >       if (wait) {
> > -             /*
> > -              * Double the wait plus some fixed margin to ensure gem_wait
> > -              * can never time out before the async hang runs.
> > -              */
> > -             wait_timeout += wait * 2000 + 250e6;
> > -             hang_after(fd, wait);
> > +             hang_after(fd, wait, &ts);
> >       } else {
> > +             igt_nsec_elapsed(&ts);
> >               manual_hang(fd);
> >       }
> >   
> > -     ret = __gem_wait(fd, bo, wait_timeout);
> > +     gem_sync(fd, bo);
> 
> One drawback I can see is if reset fails to work then we hang until IGT 
> runner timeout. Alternative would be a smaller patch to bump the wait 
> delay which should work given how rare are the failures. It was one of 
> the goals of the gem_eio rewrite I think, but whatever, I suppose 
> runtime of test when reset is broken is not that important.

It hard to put an upper bound on just how much the scheduler is going to
screw us over. On the other hand, this let's us say precisely which
portion must be done in the timely fashion (reset -> wakeup), and for
that bound can be invariant. Which I think is nice.

The downside is indeed that instead of getting a clear fail with
explanation if the reset fails completely (not even able to wedge the
device), and that hangcheck itself fails to wedge eventually, then we do
rely on the external runner to declare the test broken. I think that's
fair, it requires a series of events to fail; and in particular
hangcheck is supposed to be guarding against even its own failure.
-Chris
_______________________________________________
igt-dev mailing list
igt-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/igt-dev

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2018-08-03 14:10 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-27 11:13 [PATCH i-g-t] igt/gem_eio: Measure reset delay from thread Chris Wilson
2018-07-27 11:13 ` [igt-dev] " Chris Wilson
2018-07-27 13:24 ` [igt-dev] ✓ Fi.CI.BAT: success for " Patchwork
2018-07-27 14:52 ` [igt-dev] ✗ Fi.CI.IGT: failure " Patchwork
2018-07-27 15:16   ` Chris Wilson
2018-07-27 18:44     ` Chris Wilson
2018-07-30 16:20 ` [igt-dev] ✓ Fi.CI.BAT: success " Patchwork
2018-07-30 18:29 ` [igt-dev] ✓ Fi.CI.IGT: " Patchwork
2018-07-30 18:33   ` Chris Wilson
2018-08-03 14:03 ` [igt-dev] [PATCH i-g-t] " Tvrtko Ursulin
2018-08-03 14:03   ` Tvrtko Ursulin
2018-08-03 14:10   ` Chris Wilson
2018-08-03 14:10     ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.