All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
@ 2018-07-09 10:42 Chris Wilson
  2018-07-09 10:42 ` [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup Chris Wilson
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Chris Wilson @ 2018-07-09 10:42 UTC (permalink / raw)
  To: intel-gfx

Usually we have no idea about the upper bound we need to wait to catch
up with userspace when idling the device, but in a few situations we
know the system was idle beforehand and can provide a short timeout in
order to very quickly catch a failure, long before hangcheck kicks in.

In the following patches, we will use the timeout to curtail two overly
long waits, where we know we can expect the GPU to complete within a
reasonable time or declare it broken.

In particular, with a broken GPU we expect it to fail during the initial
GPU setup where we do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. This is a case where any gpu hang is
unacceptable, both from a timeliness and practical standpoint.

The other improvement is that in selftests, we do not need to arm an
independent timer to inject a wedge, as we can just limit the timeout on
the wait directly.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  6 ++-
 drivers/gpu/drm/i915/i915_drv.h               |  2 +-
 drivers/gpu/drm/i915/i915_gem.c               | 43 +++++++++++--------
 drivers/gpu/drm/i915/i915_gem_evict.c         |  3 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
 drivers/gpu/drm/i915/i915_gem_shrinker.c      | 11 +++--
 drivers/gpu/drm/i915/i915_perf.c              |  4 +-
 drivers/gpu/drm/i915/i915_request.c           |  6 ++-
 .../gpu/drm/i915/selftests/i915_gem_context.c |  4 +-
 drivers/gpu/drm/i915/selftests/i915_request.c |  4 +-
 .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
 11 files changed, 56 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 544e5e7f011f..099f97ef2303 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
 
 	err = i915_gem_wait_for_idle(i915,
 				     I915_WAIT_LOCKED |
-				     I915_WAIT_INTERRUPTIBLE);
+				     I915_WAIT_INTERRUPTIBLE,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (err)
 		goto err_unlock;
 
@@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
 		if (val & DROP_ACTIVE)
 			ret = i915_gem_wait_for_idle(dev_priv,
 						     I915_WAIT_INTERRUPTIBLE |
-						     I915_WAIT_LOCKED);
+						     I915_WAIT_LOCKED,
+						     MAX_SCHEDULE_TIMEOUT);
 
 		if (val & DROP_RETIRE)
 			i915_retire_requests(dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 09ab12458244..bec0a2796c37 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
 void i915_gem_fini(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
 int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
-			   unsigned int flags);
+			   unsigned int flags, long timeout);
 int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
 void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
 void i915_gem_resume(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 1a9dab302433..093d98ed7755 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
 
 	/* Attempt to reap some mmap space from dead objects */
 	do {
-		err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
+		err = i915_gem_wait_for_idle(dev_priv,
+					     I915_WAIT_INTERRUPTIBLE,
+					     MAX_SCHEDULE_TIMEOUT);
 		if (err)
 			break;
 
@@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	return ret;
 }
 
-static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
+static long wait_for_timeline(struct i915_timeline *tl,
+			      unsigned int flags, long timeout)
 {
 	struct i915_request *rq;
-	long ret;
 
 	rq = i915_gem_active_get_unlocked(&tl->last_request);
 	if (!rq)
-		return 0;
+		return timeout;
 
 	/*
 	 * "Race-to-idle".
@@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
 	if (flags & I915_WAIT_FOR_IDLE_BOOST)
 		gen6_rps_boost(rq, NULL);
 
-	ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
+	timeout = i915_request_wait(rq, flags, timeout);
 	i915_request_put(rq);
 
-	return ret < 0 ? ret : 0;
+	return timeout;
 }
 
 static int wait_for_engines(struct drm_i915_private *i915)
@@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
 	return 0;
 }
 
-int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
+int i915_gem_wait_for_idle(struct drm_i915_private *i915,
+			   unsigned int flags, long timeout)
 {
 	GEM_TRACE("flags=%x (%s)\n",
 		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
@@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
 		lockdep_assert_held(&i915->drm.struct_mutex);
 
 		list_for_each_entry(tl, &i915->gt.timelines, link) {
-			err = wait_for_timeline(tl, flags);
-			if (err)
-				return err;
+			timeout = wait_for_timeline(tl, flags, timeout);
+			if (timeout < 0)
+				return timeout;
 		}
 
 		err = wait_for_engines(i915);
@@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
 	} else {
 		struct intel_engine_cs *engine;
 		enum intel_engine_id id;
-		int err;
 
 		for_each_engine(engine, i915, id) {
-			err = wait_for_timeline(&engine->timeline, flags);
-			if (err)
-				return err;
+			struct i915_timeline *tl = &engine->timeline;
+
+			timeout = wait_for_timeline(tl, flags, timeout);
+			if (timeout < 0)
+				return timeout;
 		}
 	}
 
@@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
 		ret = i915_gem_wait_for_idle(dev_priv,
 					     I915_WAIT_INTERRUPTIBLE |
 					     I915_WAIT_LOCKED |
-					     I915_WAIT_FOR_IDLE_BOOST);
+					     I915_WAIT_FOR_IDLE_BOOST,
+					     MAX_SCHEDULE_TIMEOUT);
 		if (ret && ret != -EIO)
 			goto err_unlock;
 
@@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	if (err)
 		goto err_active;
 
-	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (err)
 		goto err_active;
 
@@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
 		goto out_ctx;
 
-	if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
+	if (WARN_ON(i915_gem_wait_for_idle(i915,
+					   I915_WAIT_LOCKED,
+					   MAX_SCHEDULE_TIMEOUT)))
 		goto out_ctx;
 
 	i915_gem_contexts_lost(i915);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 54814a196ee4..02b83a5ed96c 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
 
 	err = i915_gem_wait_for_idle(i915,
 				     I915_WAIT_INTERRUPTIBLE |
-				     I915_WAIT_LOCKED);
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (err)
 		return err;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 2ad319e17e39..abd81fb9b0b6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
 	struct i915_ggtt *ggtt = &dev_priv->ggtt;
 
 	if (unlikely(ggtt->do_idle_maps)) {
-		if (i915_gem_wait_for_idle(dev_priv, 0)) {
+		if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
 			DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
 			/* Wait a bit, in hopes it avoids the hang */
 			udelay(10);
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 55e84e71f526..c61f5b80fee3 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
 	 * we will free as much as we can and hope to get a second chance.
 	 */
 	if (flags & I915_SHRINK_ACTIVE)
-		i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+		i915_gem_wait_for_idle(i915,
+				       I915_WAIT_LOCKED,
+				       MAX_SCHEDULE_TIMEOUT);
 
 	trace_i915_gem_shrink(i915, target, flags);
 	i915_retire_requests(i915);
@@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
 	unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
 
 	do {
-		if (i915_gem_wait_for_idle(i915, 0) == 0 &&
+		if (i915_gem_wait_for_idle(i915,
+					   0, MAX_SCHEDULE_TIMEOUT) == 0 &&
 		    shrinker_lock(i915, unlock))
 			break;
 
@@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
 		return NOTIFY_DONE;
 
 	/* Force everything onto the inactive lists */
-	ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+	ret = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (ret)
 		goto out;
 
diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
index 447407fee3b8..6bf10952c724 100644
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
 	 * So far the best way to work around this issue seems to be draining
 	 * the GPU from any submitted work.
 	 */
-	ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
+	ret = i915_gem_wait_for_idle(dev_priv,
+				     wait_flags,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (ret)
 		goto out;
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 3248369dbcfb..5c2c93cbab12 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
 	/* Carefully retire all requests without writing to the rings */
 	ret = i915_gem_wait_for_idle(i915,
 				     I915_WAIT_INTERRUPTIBLE |
-				     I915_WAIT_LOCKED);
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (ret)
 		return ret;
 
@@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 		/* Ratelimit ourselves to prevent oom from malicious clients */
 		ret = i915_gem_wait_for_idle(i915,
 					     I915_WAIT_LOCKED |
-					     I915_WAIT_INTERRUPTIBLE);
+					     I915_WAIT_INTERRUPTIBLE,
+					     MAX_SCHEDULE_TIMEOUT);
 		if (ret)
 			goto err_unreserve;
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 5fbe15f4effd..ab2590242033 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
 		}
 	}
 
-	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (err)
 		return err;
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 43995fc3534d..c4aac6141e04 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
 	t->func = func;
 	t->name = name;
 
-	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
 	if (err) {
 		pr_err("%s(%s): failed to idle before, with err=%d!",
 		       func, name, err);
diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
index 0d06f559243f..09ab037ce803 100644
--- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
+++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
@@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
 	}
 
 	wedge_on_timeout(&w, i915, HZ)
-		i915_gem_wait_for_idle(i915, flags);
+		i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
 
 	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
 }
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup
  2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
@ 2018-07-09 10:42 ` Chris Wilson
  2018-07-09 12:01   ` Mika Kuoppala
  2018-07-09 10:42 ` [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout Chris Wilson
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Chris Wilson @ 2018-07-09 10:42 UTC (permalink / raw)
  To: intel-gfx

With a broken GPU we expect it to fail during the initial
GPU setup where we do a couple of context switches to record the defaults.
This is a task that takes a few milliseconds even on the slowest of
devices, but we may have to wait 60s for hangcheck to give in and
declare the machine inoperable. This is a case where any gpu hang is
unacceptable, both from a timeliness and practical standpoint.

We can therefore set a timeout on our wait-for-idle that is shorter than
the hangcheck (which may be up to 60s for declaring a wedged driver)
and so detect the broken GPU much more quickly during driver load (and
so prevent stalling userspace for ages).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 093d98ed7755..05a9486dd4cf 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5361,11 +5361,10 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	if (err)
 		goto err_active;
 
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err)
+	if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
+		err = -EIO; /* Caller will declare us wedged */
 		goto err_active;
+	}
 
 	assert_kernel_context_is_current(i915);
 
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout
  2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
  2018-07-09 10:42 ` [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup Chris Wilson
@ 2018-07-09 10:42 ` Chris Wilson
  2018-07-09 12:07   ` Mika Kuoppala
  2018-07-09 11:18 ` ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Patchwork
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 11+ messages in thread
From: Chris Wilson @ 2018-07-09 10:42 UTC (permalink / raw)
  To: intel-gfx

In igt_flush_test() we install a background timer in order to ensure
that the wait completes within a certain time. We can now tell the wait
that it has to complete within a timeout, and so no longer need the
background timer.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 .../gpu/drm/i915/selftests/igt_flush_test.c   | 55 +++----------------
 1 file changed, 9 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
index 09ab037ce803..af66e3d4e23a 100644
--- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
+++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
@@ -9,52 +9,8 @@
 #include "../i915_selftest.h"
 #include "igt_flush_test.h"
 
-struct wedge_me {
-	struct delayed_work work;
-	struct drm_i915_private *i915;
-	const void *symbol;
-};
-
-static void wedge_me(struct work_struct *work)
-{
-	struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
-	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
-
-	GEM_TRACE("%pS timed out.\n", w->symbol);
-	GEM_TRACE_DUMP();
-
-	i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
-			 struct drm_i915_private *i915,
-			 long timeout,
-			 const void *symbol)
-{
-	w->i915 = i915;
-	w->symbol = symbol;
-
-	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
-	schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
-	cancel_delayed_work_sync(&w->work);
-	destroy_delayed_work_on_stack(&w->work);
-	w->i915 = NULL;
-}
-
-#define wedge_on_timeout(W, DEV, TIMEOUT)				\
-	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
-	     (W)->i915;							\
-	     __fini_wedge((W)))
-
 int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
 {
-	struct wedge_me w;
-
 	cond_resched();
 
 	if (flags & I915_WAIT_LOCKED &&
@@ -63,8 +19,15 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
 		i915_gem_set_wedged(i915);
 	}
 
-	wedge_on_timeout(&w, i915, HZ)
-		i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
+	if (i915_gem_wait_for_idle(i915, flags, HZ / 5) == -ETIME) {
+		pr_err("%pS timed out, cancelling all further testing.\n",
+		       __builtin_return_address(0));
+
+		GEM_TRACE("%pS timed out.\n", __builtin_return_address(0));
+		GEM_TRACE_DUMP();
+
+		i915_gem_set_wedged(i915);
+	}
 
 	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
 }
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
  2018-07-09 10:42 ` [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup Chris Wilson
  2018-07-09 10:42 ` [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout Chris Wilson
@ 2018-07-09 11:18 ` Patchwork
  2018-07-09 11:38 ` [PATCH 1/3] " Mika Kuoppala
  2018-07-09 13:54 ` ✓ Fi.CI.IGT: success for series starting with [1/3] " Patchwork
  4 siblings, 0 replies; 11+ messages in thread
From: Patchwork @ 2018-07-09 11:18 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
URL   : https://patchwork.freedesktop.org/series/46170/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4453 -> Patchwork_9589 =

== Summary - SUCCESS ==

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/46170/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9589:

  === IGT changes ===

    ==== Warnings ====

    igt@drv_module_reload@basic-no-display:
      {fi-skl-iommu}:     FAIL (fdo#106066) -> DMESG-FAIL +2

    
== Known issues ==

  Here are the changes found in Patchwork_9589 that come from known issues:

  === IGT changes ===

    ==== Possible fixes ====

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
      fi-snb-2520m:       INCOMPLETE (fdo#103713) -> PASS

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#103713 https://bugs.freedesktop.org/show_bug.cgi?id=103713
  fdo#106066 https://bugs.freedesktop.org/show_bug.cgi?id=106066


== Participating hosts (44 -> 40) ==

  Missing    (4): fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-hsw-4200u 


== Build changes ==

    * Linux: CI_DRM_4453 -> Patchwork_9589

  CI_DRM_4453: 940e07c2a136b53965e4a6728e01e1638f5ba2de @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4543: 366eed37c7c71217e1cb1f3be5e26358a41f0001 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9589: ea43af3817fe6016fc2fadb156796493ea31977d @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

ea43af3817fe drm/i915/selftests: Replace wait-on-timeout with explicit timeout
b6258c60f520 drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup
7de1c7a931a0 drm/i915: Provide a timeout to i915_gem_wait_for_idle()

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9589/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
                   ` (2 preceding siblings ...)
  2018-07-09 11:18 ` ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Patchwork
@ 2018-07-09 11:38 ` Mika Kuoppala
  2018-07-09 11:47   ` Chris Wilson
  2018-07-09 13:54 ` ✓ Fi.CI.IGT: success for series starting with [1/3] " Patchwork
  4 siblings, 1 reply; 11+ messages in thread
From: Mika Kuoppala @ 2018-07-09 11:38 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Usually we have no idea about the upper bound we need to wait to catch
> up with userspace when idling the device, but in a few situations we
> know the system was idle beforehand and can provide a short timeout in
> order to very quickly catch a failure, long before hangcheck kicks in.
>
> In the following patches, we will use the timeout to curtain two overly
> long waits, where we know we can expect the GPU to complete within a
> reasonable time or declare it broken.
>
> In particular, with a broken GPU we expect it to fail during the initial
> GPU setup where do a couple of context switches to record the defaults.
> This is a task that takes a few milliseconds even on the slowest of
> devices, but we may have to wait 60s for hangcheck to give in and
> declare the machine inoperable. In this a case where any gpu hang is
> unacceptable, both from a timeliness and practical standpoint.
>
> The other improvement is that in selftests, we do not need to arm an
> independent timer to inject a wedge, as we can just limit the timeout on
> the wait directly.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c           |  6 ++-
>  drivers/gpu/drm/i915/i915_drv.h               |  2 +-
>  drivers/gpu/drm/i915/i915_gem.c               | 43 +++++++++++--------
>  drivers/gpu/drm/i915/i915_gem_evict.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
>  drivers/gpu/drm/i915/i915_gem_shrinker.c      | 11 +++--
>  drivers/gpu/drm/i915/i915_perf.c              |  4 +-
>  drivers/gpu/drm/i915/i915_request.c           |  6 ++-
>  .../gpu/drm/i915/selftests/i915_gem_context.c |  4 +-
>  drivers/gpu/drm/i915/selftests/i915_request.c |  4 +-
>  .../gpu/drm/i915/selftests/igt_flush_test.c   |  2 +-
>  11 files changed, 56 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 544e5e7f011f..099f97ef2303 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4105,7 +4105,8 @@ fault_irq_set(struct drm_i915_private *i915,
>  
>  	err = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_LOCKED |
> -				     I915_WAIT_INTERRUPTIBLE);
> +				     I915_WAIT_INTERRUPTIBLE,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		goto err_unlock;
>  
> @@ -4210,7 +4211,8 @@ i915_drop_caches_set(void *data, u64 val)
>  		if (val & DROP_ACTIVE)
>  			ret = i915_gem_wait_for_idle(dev_priv,
>  						     I915_WAIT_INTERRUPTIBLE |
> -						     I915_WAIT_LOCKED);
> +						     I915_WAIT_LOCKED,
> +						     MAX_SCHEDULE_TIMEOUT);
>  
>  		if (val & DROP_RETIRE)
>  			i915_retire_requests(dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 09ab12458244..bec0a2796c37 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3154,7 +3154,7 @@ void i915_gem_init_swizzling(struct drm_i915_private *dev_priv);
>  void i915_gem_fini(struct drm_i915_private *dev_priv);
>  void i915_gem_cleanup_engines(struct drm_i915_private *dev_priv);
>  int i915_gem_wait_for_idle(struct drm_i915_private *dev_priv,
> -			   unsigned int flags);
> +			   unsigned int flags, long timeout);
>  int __must_check i915_gem_suspend(struct drm_i915_private *dev_priv);
>  void i915_gem_suspend_late(struct drm_i915_private *dev_priv);
>  void i915_gem_resume(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 1a9dab302433..093d98ed7755 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2267,7 +2267,9 @@ static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
>  
>  	/* Attempt to reap some mmap space from dead objects */
>  	do {
> -		err = i915_gem_wait_for_idle(dev_priv, I915_WAIT_INTERRUPTIBLE);
> +		err = i915_gem_wait_for_idle(dev_priv,
> +					     I915_WAIT_INTERRUPTIBLE,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (err)
>  			break;
>  
> @@ -3742,14 +3744,14 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>  	return ret;
>  }
>  
> -static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
> +static long wait_for_timeline(struct i915_timeline *tl,
> +			      unsigned int flags, long timeout)
>  {
>  	struct i915_request *rq;
> -	long ret;
>  
>  	rq = i915_gem_active_get_unlocked(&tl->last_request);
>  	if (!rq)
> -		return 0;
> +		return timeout;
>  
>  	/*
>  	 * "Race-to-idle".
> @@ -3763,10 +3765,10 @@ static int wait_for_timeline(struct i915_timeline *tl, unsigned int flags)
>  	if (flags & I915_WAIT_FOR_IDLE_BOOST)
>  		gen6_rps_boost(rq, NULL);
>  
> -	ret = i915_request_wait(rq, flags, MAX_SCHEDULE_TIMEOUT);
> +	timeout = i915_request_wait(rq, flags, timeout);
>  	i915_request_put(rq);
>  
> -	return ret < 0 ? ret : 0;
> +	return timeout;
>  }
>  
>  static int wait_for_engines(struct drm_i915_private *i915)
> @@ -3782,7 +3784,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
>  	return 0;
>  }
>  
> -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> +			   unsigned int flags, long timeout)
>  {
>  	GEM_TRACE("flags=%x (%s)\n",
>  		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");

Did you omit enhancing the trace on purpose?

Eventually i915_request_wait will assert for silly timeout values, but
should we add assertion here too as wait_for_timeline will
return what we put, for nonactive timelines?

> @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  		lockdep_assert_held(&i915->drm.struct_mutex);
>  
>  		list_for_each_entry(tl, &i915->gt.timelines, link) {
> -			err = wait_for_timeline(tl, flags);
> -			if (err)
> -				return err;
> +			timeout = wait_for_timeline(tl, flags, timeout);
> +			if (timeout < 0)
> +				return timeout;
>  		}
>  
>  		err = wait_for_engines(i915);

To truely serve the caller, the remaining timeout could be passed to
wait_for_engines too, but that can be followup.

So the only real thing is the missing trace.
-Mika


> @@ -3812,12 +3815,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  	} else {
>  		struct intel_engine_cs *engine;
>  		enum intel_engine_id id;
> -		int err;
>  
>  		for_each_engine(engine, i915, id) {
> -			err = wait_for_timeline(&engine->timeline, flags);
> -			if (err)
> -				return err;
> +			struct i915_timeline *tl = &engine->timeline;
> +
> +			timeout = wait_for_timeline(tl, flags, timeout);
> +			if (timeout < 0)
> +				return timeout;
>  		}
>  	}
>  
> @@ -5052,7 +5056,8 @@ int i915_gem_suspend(struct drm_i915_private *dev_priv)
>  		ret = i915_gem_wait_for_idle(dev_priv,
>  					     I915_WAIT_INTERRUPTIBLE |
>  					     I915_WAIT_LOCKED |
> -					     I915_WAIT_FOR_IDLE_BOOST);
> +					     I915_WAIT_FOR_IDLE_BOOST,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (ret && ret != -EIO)
>  			goto err_unlock;
>  
> @@ -5356,7 +5361,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>  	if (err)
>  		goto err_active;
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		goto err_active;
>  
> @@ -5421,7 +5428,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
>  	if (WARN_ON(i915_gem_switch_to_kernel_context(i915)))
>  		goto out_ctx;
>  
> -	if (WARN_ON(i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED)))
> +	if (WARN_ON(i915_gem_wait_for_idle(i915,
> +					   I915_WAIT_LOCKED,
> +					   MAX_SCHEDULE_TIMEOUT)))
>  		goto out_ctx;
>  
>  	i915_gem_contexts_lost(i915);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 54814a196ee4..02b83a5ed96c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -69,7 +69,8 @@ static int ggtt_flush(struct drm_i915_private *i915)
>  
>  	err = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_INTERRUPTIBLE |
> -				     I915_WAIT_LOCKED);
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 2ad319e17e39..abd81fb9b0b6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2793,7 +2793,7 @@ void i915_gem_gtt_finish_pages(struct drm_i915_gem_object *obj,
>  	struct i915_ggtt *ggtt = &dev_priv->ggtt;
>  
>  	if (unlikely(ggtt->do_idle_maps)) {
> -		if (i915_gem_wait_for_idle(dev_priv, 0)) {
> +		if (i915_gem_wait_for_idle(dev_priv, 0, MAX_SCHEDULE_TIMEOUT)) {
>  			DRM_ERROR("Failed to wait for idle; VT'd may hang.\n");
>  			/* Wait a bit, in hopes it avoids the hang */
>  			udelay(10);
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 55e84e71f526..c61f5b80fee3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -172,7 +172,9 @@ i915_gem_shrink(struct drm_i915_private *i915,
>  	 * we will free as much as we can and hope to get a second chance.
>  	 */
>  	if (flags & I915_SHRINK_ACTIVE)
> -		i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +		i915_gem_wait_for_idle(i915,
> +				       I915_WAIT_LOCKED,
> +				       MAX_SCHEDULE_TIMEOUT);
>  
>  	trace_i915_gem_shrink(i915, target, flags);
>  	i915_retire_requests(i915);
> @@ -392,7 +394,8 @@ shrinker_lock_uninterruptible(struct drm_i915_private *i915, bool *unlock,
>  	unsigned long timeout = jiffies + msecs_to_jiffies_timeout(timeout_ms);
>  
>  	do {
> -		if (i915_gem_wait_for_idle(i915, 0) == 0 &&
> +		if (i915_gem_wait_for_idle(i915,
> +					   0, MAX_SCHEDULE_TIMEOUT) == 0 &&
>  		    shrinker_lock(i915, unlock))
>  			break;
>  
> @@ -466,7 +469,9 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
>  		return NOTIFY_DONE;
>  
>  	/* Force everything onto the inactive lists */
> -	ret = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	ret = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c
> index 447407fee3b8..6bf10952c724 100644
> --- a/drivers/gpu/drm/i915/i915_perf.c
> +++ b/drivers/gpu/drm/i915/i915_perf.c
> @@ -1836,7 +1836,9 @@ static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
>  	 * So far the best way to work around this issue seems to be draining
>  	 * the GPU from any submitted work.
>  	 */
> -	ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
> +	ret = i915_gem_wait_for_idle(dev_priv,
> +				     wait_flags,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		goto out;
>  
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 3248369dbcfb..5c2c93cbab12 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -206,7 +206,8 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
>  	/* Carefully retire all requests without writing to the rings */
>  	ret = i915_gem_wait_for_idle(i915,
>  				     I915_WAIT_INTERRUPTIBLE |
> -				     I915_WAIT_LOCKED);
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (ret)
>  		return ret;
>  
> @@ -735,7 +736,8 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>  		/* Ratelimit ourselves to prevent oom from malicious clients */
>  		ret = i915_gem_wait_for_idle(i915,
>  					     I915_WAIT_LOCKED |
> -					     I915_WAIT_INTERRUPTIBLE);
> +					     I915_WAIT_INTERRUPTIBLE,
> +					     MAX_SCHEDULE_TIMEOUT);
>  		if (ret)
>  			goto err_unreserve;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 5fbe15f4effd..ab2590242033 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -478,7 +478,9 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
>  		}
>  	}
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err)
>  		return err;
>  
> diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
> index 43995fc3534d..c4aac6141e04 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_request.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_request.c
> @@ -286,7 +286,9 @@ static int begin_live_test(struct live_test *t,
>  	t->func = func;
>  	t->name = name;
>  
> -	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED);
> +	err = i915_gem_wait_for_idle(i915,
> +				     I915_WAIT_LOCKED,
> +				     MAX_SCHEDULE_TIMEOUT);
>  	if (err) {
>  		pr_err("%s(%s): failed to idle before, with err=%d!",
>  		       func, name, err);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> index 0d06f559243f..09ab037ce803 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> @@ -64,7 +64,7 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
>  	}
>  
>  	wedge_on_timeout(&w, i915, HZ)
> -		i915_gem_wait_for_idle(i915, flags);
> +		i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
>  
>  	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
>  }
> -- 
> 2.18.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 11:38 ` [PATCH 1/3] " Mika Kuoppala
@ 2018-07-09 11:47   ` Chris Wilson
  2018-07-09 11:52     ` Mika Kuoppala
  0 siblings, 1 reply; 11+ messages in thread
From: Chris Wilson @ 2018-07-09 11:47 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-07-09 12:38:10)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> > -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> > +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> > +                        unsigned int flags, long timeout)
> >  {
> >       GEM_TRACE("flags=%x (%s)\n",
> >                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
> 
> Did you omit enhancing the trace on purpose?

Didn't even consider it. I personally only use it as a breadcrumb in the
trace.
 
> Eventually i915_request_wait will assert for silly timeout values, but
> should we add assertion here too as wait_for_timeline will
> return what we put, for nonactive timelines?
> 
> > @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> >               lockdep_assert_held(&i915->drm.struct_mutex);
> >  
> >               list_for_each_entry(tl, &i915->gt.timelines, link) {
> > -                     err = wait_for_timeline(tl, flags);
> > -                     if (err)
> > -                             return err;
> > +                     timeout = wait_for_timeline(tl, flags, timeout);
> > +                     if (timeout < 0)
> > +                             return timeout;
> >               }
> >  
> >               err = wait_for_engines(i915);
> 
> To truly serve the caller, the remaining timeout could be passed to
> wait_for_engines too, but that can be followup.

I don't think so. The timeout we employ there is specific to the CSB
delay and we need to employ that irrespective of the caller timeout.
At this point, we have successfully waited for all requests, and now
just have to drain any HW queues -- imo, we have completed the task the
caller set out for us (requests completed within the timeout). And now
onto the followup retirement optimisations.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 11:47   ` Chris Wilson
@ 2018-07-09 11:52     ` Mika Kuoppala
  2018-07-09 13:06       ` Chris Wilson
  0 siblings, 1 reply; 11+ messages in thread
From: Mika Kuoppala @ 2018-07-09 11:52 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2018-07-09 12:38:10)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> > -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>> > +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
>> > +                        unsigned int flags, long timeout)
>> >  {
>> >       GEM_TRACE("flags=%x (%s)\n",
>> >                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
>> 
>> Did you omit enhancing the trace on purpose?
>
> Didn't even consider it. I personally only use it as a breadcrumb in the
> trace.
>  
>> Eventually i915_request_wait will assert for silly timeout values, but
>> should we add assertion here too as wait_for_timeline will
>> return what we put, for nonactive timelines?
>> 
>> > @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>> >               lockdep_assert_held(&i915->drm.struct_mutex);
>> >  
>> >               list_for_each_entry(tl, &i915->gt.timelines, link) {
>> > -                     err = wait_for_timeline(tl, flags);
>> > -                     if (err)
>> > -                             return err;
>> > +                     timeout = wait_for_timeline(tl, flags, timeout);
>> > +                     if (timeout < 0)
>> > +                             return timeout;
>> >               }
>> >  
>> >               err = wait_for_engines(i915);
>> 
>> To truly serve the caller, the remaining timeout could be passed to
>> wait_for_engines too, but that can be followup.
>
> I don't think so. The timeout we employ there is specific to the CSB
> delay and we need to employ that irrespective of the caller timeout.
> At this point, we have successfully waited for all requests, and now
> just have to drain any HW queues -- imo, we have completed the task the
> caller set out for us (requests completed within the timeout). And now
> onto the followup retirement optimisations.

Agreed, my mistake. At that point all the requests are already done.
If the engine hw flush times out, that is catastrophic, regardless
of caller preferences.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> -Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup
  2018-07-09 10:42 ` [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup Chris Wilson
@ 2018-07-09 12:01   ` Mika Kuoppala
  0 siblings, 0 replies; 11+ messages in thread
From: Mika Kuoppala @ 2018-07-09 12:01 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> With a broken GPU we expect it to fail during the initial
> GPU setup where do a couple of context switches to record the defaults.
> This is a task that takes a few milliseconds even on the slowest of
> devices, but we may have to wait 60s for hangcheck to give in and
> declare the machine inoperable. In this a case where any gpu hang is
> unacceptable, both from a timeliness and practical standpoint.
>
> We can therefore set a timeout on our wait-for-idle that is shorter than
> the hangcheck (which may be up to 60s for a declaring a wedged driver)
> and so detect the broken GPU much more quickly during driver load (and
> so prevent stalling userspace for ages).
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout
  2018-07-09 10:42 ` [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout Chris Wilson
@ 2018-07-09 12:07   ` Mika Kuoppala
  0 siblings, 0 replies; 11+ messages in thread
From: Mika Kuoppala @ 2018-07-09 12:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> In igt_flush_test() we install a background timer in order to ensure
> that the wait completes within a certain time. We can now tell the wait
> that it has to complete within a timeout, and so no longer need the
> background timer.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  .../gpu/drm/i915/selftests/igt_flush_test.c   | 55 +++----------------
>  1 file changed, 9 insertions(+), 46 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/igt_flush_test.c b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> index 09ab037ce803..af66e3d4e23a 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_flush_test.c
> @@ -9,52 +9,8 @@
>  #include "../i915_selftest.h"
>  #include "igt_flush_test.h"
>  
> -struct wedge_me {
> -	struct delayed_work work;
> -	struct drm_i915_private *i915;
> -	const void *symbol;
> -};
> -
> -static void wedge_me(struct work_struct *work)
> -{
> -	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> -
> -	pr_err("%pS timed out, cancelling all further testing.\n", w->symbol);
> -
> -	GEM_TRACE("%pS timed out.\n", w->symbol);
> -	GEM_TRACE_DUMP();
> -
> -	i915_gem_set_wedged(w->i915);
> -}
> -
> -static void __init_wedge(struct wedge_me *w,
> -			 struct drm_i915_private *i915,
> -			 long timeout,
> -			 const void *symbol)
> -{
> -	w->i915 = i915;
> -	w->symbol = symbol;
> -
> -	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> -	schedule_delayed_work(&w->work, timeout);
> -}
> -
> -static void __fini_wedge(struct wedge_me *w)
> -{
> -	cancel_delayed_work_sync(&w->work);
> -	destroy_delayed_work_on_stack(&w->work);
> -	w->i915 = NULL;
> -}
> -
> -#define wedge_on_timeout(W, DEV, TIMEOUT)				\
> -	for (__init_wedge((W), (DEV), (TIMEOUT), __builtin_return_address(0)); \
> -	     (W)->i915;							\
> -	     __fini_wedge((W)))
> -
>  int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
>  {
> -	struct wedge_me w;
> -
>  	cond_resched();
>  
>  	if (flags & I915_WAIT_LOCKED &&
> @@ -63,8 +19,15 @@ int igt_flush_test(struct drm_i915_private *i915, unsigned int flags)
>  		i915_gem_set_wedged(i915);
>  	}
>  
> -	wedge_on_timeout(&w, i915, HZ)
> -		i915_gem_wait_for_idle(i915, flags, MAX_SCHEDULE_TIMEOUT);
> +	if (i915_gem_wait_for_idle(i915, flags, HZ / 5) == -ETIME) {

For all the callers, changing from HZ to HZ/5 should be acceptable.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


> +		pr_err("%pS timed out, cancelling all further testing.\n",
> +		       __builtin_return_address(0));
> +
> +		GEM_TRACE("%pS timed out.\n", __builtin_return_address(0));
> +		GEM_TRACE_DUMP();
> +
> +		i915_gem_set_wedged(i915);
> +	}
>  
>  	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
>  }
> -- 
> 2.18.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 11:52     ` Mika Kuoppala
@ 2018-07-09 13:06       ` Chris Wilson
  0 siblings, 0 replies; 11+ messages in thread
From: Chris Wilson @ 2018-07-09 13:06 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-07-09 12:52:51)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2018-07-09 12:38:10)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> > -int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> >> > +int i915_gem_wait_for_idle(struct drm_i915_private *i915,
> >> > +                        unsigned int flags, long timeout)
> >> >  {
> >> >       GEM_TRACE("flags=%x (%s)\n",
> >> >                 flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked");
> >> 
> >> Did you omit enhancing the trace on purpose?
> >
> > Didn't even consider it. I personally only use it as a breadcrumb in the
> > trace.
> >  
> >> Eventually i915_request_wait will assert for silly timeout values, but
> >> should we add assertion here too as wait_for_timeline will
> >> return what we put, for nonactive timelines?
> >> 
> >> > @@ -3798,9 +3801,9 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
> >> >               lockdep_assert_held(&i915->drm.struct_mutex);
> >> >  
> >> >               list_for_each_entry(tl, &i915->gt.timelines, link) {
> >> > -                     err = wait_for_timeline(tl, flags);
> >> > -                     if (err)
> >> > -                             return err;
> >> > +                     timeout = wait_for_timeline(tl, flags, timeout);
> >> > +                     if (timeout < 0)
> >> > +                             return timeout;
> >> >               }
> >> >  
> >> >               err = wait_for_engines(i915);
> >> 
> >> To truly serve the caller, the remaining timeout could be passed to
> >> wait_for_engines too, but that can be followup.
> >
> > I don't think so. The timeout we employ there is specific to the CSB
> > delay and we need to employ that irrespective of the caller timeout.
> > At this point, we have successfully waited for all requests, and now
> > just have to drain any HW queues -- imo, we have completed the task the
> > caller set out for us (requests completed within the timeout). And now
> > onto the followup retirement optimisations.
> 
> Agreed, my mistake. At that point all the requests are already done.
> If the engine hw flush times out, that is catastrophic, regardless
> of caller preferences.
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Added the timeout info to the GEM_TRACE and pushed. Thanks for the review,
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

* ✓ Fi.CI.IGT: success for series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
  2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
                   ` (3 preceding siblings ...)
  2018-07-09 11:38 ` [PATCH 1/3] " Mika Kuoppala
@ 2018-07-09 13:54 ` Patchwork
  4 siblings, 0 replies; 11+ messages in thread
From: Patchwork @ 2018-07-09 13:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle()
URL   : https://patchwork.freedesktop.org/series/46170/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4453_full -> Patchwork_9589_full =

== Summary - WARNING ==

  Minor unknown changes coming with Patchwork_9589_full need to be verified
  manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_9589_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9589_full:

  === IGT changes ===

    ==== Warnings ====

    igt@gem_exec_schedule@deep-blt:
      shard-kbl:          PASS -> SKIP +1

    igt@gem_exec_schedule@deep-bsd2:
      shard-kbl:          SKIP -> PASS +1

    
== Known issues ==

  Here are the changes found in Patchwork_9589_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@drv_suspend@shrink:
      shard-kbl:          PASS -> FAIL (fdo#106886)

    igt@gem_render_linear_blits@basic:
      shard-kbl:          PASS -> INCOMPLETE (fdo#103665)

    igt@kms_flip@2x-flip-vs-blocking-wf-vblank:
      shard-glk:          PASS -> FAIL (fdo#100368)

    igt@kms_flip_tiling@flip-y-tiled:
      shard-glk:          PASS -> FAIL (fdo#107161)

    
    ==== Possible fixes ====

    igt@drv_selftest@live_hangcheck:
      shard-kbl:          DMESG-FAIL (fdo#106560, fdo#106947) -> PASS

    igt@drv_suspend@shrink:
      shard-apl:          INCOMPLETE (fdo#103927) -> PASS

    igt@kms_cursor_legacy@2x-nonblocking-modeset-vs-cursor-atomic:
      shard-glk:          FAIL (fdo#105454, fdo#106509) -> PASS

    igt@kms_draw_crc@draw-method-rgb565-mmap-wc-ytiled:
      shard-glk:          FAIL (fdo#103184) -> PASS

    igt@kms_flip_tiling@flip-to-x-tiled:
      shard-glk:          FAIL (fdo#103822, fdo#107161) -> PASS

    igt@kms_flip_tiling@flip-x-tiled:
      shard-glk:          FAIL (fdo#107161) -> PASS

    igt@kms_plane@plane-panning-top-left-pipe-b-planes:
      shard-kbl:          DMESG-WARN (fdo#106247) -> PASS

    
  fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
  fdo#103184 https://bugs.freedesktop.org/show_bug.cgi?id=103184
  fdo#103665 https://bugs.freedesktop.org/show_bug.cgi?id=103665
  fdo#103822 https://bugs.freedesktop.org/show_bug.cgi?id=103822
  fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
  fdo#105454 https://bugs.freedesktop.org/show_bug.cgi?id=105454
  fdo#106247 https://bugs.freedesktop.org/show_bug.cgi?id=106247
  fdo#106509 https://bugs.freedesktop.org/show_bug.cgi?id=106509
  fdo#106560 https://bugs.freedesktop.org/show_bug.cgi?id=106560
  fdo#106886 https://bugs.freedesktop.org/show_bug.cgi?id=106886
  fdo#106947 https://bugs.freedesktop.org/show_bug.cgi?id=106947
  fdo#107161 https://bugs.freedesktop.org/show_bug.cgi?id=107161


== Participating hosts (5 -> 5) ==

  No changes in participating hosts


== Build changes ==

    * Linux: CI_DRM_4453 -> Patchwork_9589

  CI_DRM_4453: 940e07c2a136b53965e4a6728e01e1638f5ba2de @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4543: 366eed37c7c71217e1cb1f3be5e26358a41f0001 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9589: ea43af3817fe6016fc2fadb156796493ea31977d @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9589/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-07-09 13:54 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-09 10:42 [PATCH 1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Chris Wilson
2018-07-09 10:42 ` [PATCH 2/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() on setup Chris Wilson
2018-07-09 12:01   ` Mika Kuoppala
2018-07-09 10:42 ` [PATCH 3/3] drm/i915/selftests: Replace wait-on-timeout with explicit timeout Chris Wilson
2018-07-09 12:07   ` Mika Kuoppala
2018-07-09 11:18 ` ✓ Fi.CI.BAT: success for series starting with [1/3] drm/i915: Provide a timeout to i915_gem_wait_for_idle() Patchwork
2018-07-09 11:38 ` [PATCH 1/3] " Mika Kuoppala
2018-07-09 11:47   ` Chris Wilson
2018-07-09 11:52     ` Mika Kuoppala
2018-07-09 13:06       ` Chris Wilson
2018-07-09 13:54 ` ✓ Fi.CI.IGT: success for series starting with [1/3] " Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.