All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Break modeset deadlocks on reset
@ 2017-06-20 16:12 Chris Wilson
  2017-06-20 19:55 ` Chris Wilson
                   ` (5 more replies)
  0 siblings, 6 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-20 16:12 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
 drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7dcac3bfb771..8122cab400ef 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
 	/* Mark all executing requests as skipped */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 	list_for_each_entry(request, &engine->timeline->requests, link)
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EIO);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* Mark all pending requests as complete so that any concurrent
@@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
 	for_each_engine(engine, i915, id)
 		engine_set_wedged(engine);
 
@@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
-
-	/* Retire completed requests first so the list of inflight/incomplete
-	 * requests is accurate and we don't try and mark successful requests
-	 * as in error during __i915_gem_set_wedged_BKL().
-	 */
-	i915_gem_retire_requests(dev_priv);
-
 	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
-
-	i915_gem_context_lost(dev_priv);
-
-	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
@@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
+	i915_gem_context_lost(i915);
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index bce2d1feceb1..d2521a1973d9 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
+		  w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static bool init_wedge_on_timeout(struct wedge_me *w,
+				  struct drm_i915_private *i915,
+				  long timeout,
+				  const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+
+	schedule_delayed_work(&w->work, HZ);
+	return true;
+}
+
+static void destroy_wedge_on_timeout(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+}
+
 /**
  * i915_reset_and_wakeup - do process context error handling work
  * @dev_priv: i915 device private
@@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	intel_prepare_reset(dev_priv);
+	if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
+		intel_prepare_reset(dev_priv);
+		destroy_wedge_on_timeout(&w);
+	}
 
 	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.wait_queue);
@@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 				     HZ));
 
 	intel_finish_reset(dev_priv);
+	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 
 	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		kobject_uevent_env(kobj,
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
@ 2017-06-20 19:55 ` Chris Wilson
  2017-06-21 11:30   ` Tvrtko Ursulin
  2017-06-20 20:11 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev2) Patchwork
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2017-06-20 19:55 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
 drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c2213016fd86..973f4f9e6b84 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
 	/* Mark all executing requests as skipped */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 	list_for_each_entry(request, &engine->timeline->requests, link)
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EIO);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* Mark all pending requests as complete so that any concurrent
@@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
 	for_each_engine(engine, i915, id)
 		engine_set_wedged(engine);
 
@@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
-
-	/* Retire completed requests first so the list of inflight/incomplete
-	 * requests is accurate and we don't try and mark successful requests
-	 * as in error during __i915_gem_set_wedged_BKL().
-	 */
-	i915_gem_retire_requests(dev_priv);
-
 	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
-
-	i915_gem_contexts_lost(dev_priv);
-
-	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
@@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
+	i915_gem_contexts_lost(i915);
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index bce2d1feceb1..6585bcdf61a6 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
+		  w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static bool init_wedge_on_timeout(struct wedge_me *w,
+				  struct drm_i915_private *i915,
+				  long timeout,
+				  const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+
+	schedule_delayed_work(&w->work, HZ);
+	return true;
+}
+
+static void destroy_wedge_on_timeout(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+}
+
 /**
  * i915_reset_and_wakeup - do process context error handling work
  * @dev_priv: i915 device private
@@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	intel_prepare_reset(dev_priv);
+	if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
+		intel_prepare_reset(dev_priv);
+		destroy_wedge_on_timeout(&w);
+	}
 
 	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.wait_queue);
@@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 				     HZ));
 
 	intel_finish_reset(dev_priv);
+	mod_delayed_work(dev_priv->wq, &dev_priv->gt.retire_work, 0);
 
 	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		kobject_uevent_env(kobj,
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev2)
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
  2017-06-20 19:55 ` Chris Wilson
@ 2017-06-20 20:11 ` Patchwork
  2017-06-21 14:24 ` [PATCH v2] drm/i915: Break modeset deadlocks on reset Chris Wilson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2017-06-20 20:11 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Break modeset deadlocks on reset (rev2)
URL   : https://patchwork.freedesktop.org/series/26059/
State : success

== Summary ==

Series 26059v2 drm/i915: Break modeset deadlocks on reset
https://patchwork.freedesktop.org/api/1.0/series/26059/revisions/2/mbox/

Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                fail       -> PASS       (fi-snb-2600) fdo#100007
Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                dmesg-warn -> PASS       (fi-kbl-7560u) fdo#100125
Test prime_busy:
        Subgroup basic-wait-after-default:
                pass       -> DMESG-WARN (fi-skl-6700hq) fdo#101515 +1
Test prime_self_import:
        Subgroup basic-llseek-bad:
                dmesg-warn -> PASS       (fi-skl-6700hq)
Test drv_module_reload:
        Subgroup basic-reload:
                dmesg-warn -> PASS       (fi-skl-6700hq)

fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007
fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#101515 https://bugs.freedesktop.org/show_bug.cgi?id=101515

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time:446s
fi-bdw-gvtdvm    total:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  time:428s
fi-bsw-n3050     total:278  pass:241  dwarn:1   dfail:0   fail:0   skip:36  time:532s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:512s
fi-byt-j1900     total:278  pass:252  dwarn:2   dfail:0   fail:0   skip:24  time:494s
fi-byt-n2820     total:278  pass:248  dwarn:2   dfail:0   fail:0   skip:28  time:485s
fi-glk-2a        total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:586s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:428s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:413s
fi-ilk-650       total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  time:418s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:495s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:469s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:466s
fi-kbl-7560u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:578s
fi-kbl-r         total:278  pass:259  dwarn:1   dfail:0   fail:0   skip:18  time:580s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:457s
fi-skl-6700hq    total:278  pass:220  dwarn:3   dfail:0   fail:30  skip:24  time:337s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time:461s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:477s
fi-skl-gvtdvm    total:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  time:436s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time:546s
fi-snb-2600      total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  time:418s

0c75e3f237ff950c7ba824cbb5f93f40b4bb9c02 drm-tip: 2017y-06m-20d-16h-15m-17s UTC integration manifest
4a3d7e2 drm/i915: Break modeset deadlocks on reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_5003/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-20 19:55 ` Chris Wilson
@ 2017-06-21 11:30   ` Tvrtko Ursulin
  2017-06-21 11:46     ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2017-06-21 11:30 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala


On 20/06/2017 20:55, Chris Wilson wrote:
> Trying to do a modeset from within a reset is fraught with danger. We
> can fall into a cyclic deadlock where the modeset is waiting on a
> previous modeset that is waiting on a request, and since the GPU hung
> that request completion is waiting on the reset. As modesetting doesn't
> allow its locks to be broken and restarted, or for its *own* reset
> mechanism to take over the display, we have to do something very
> evil instead. If we detect that we are stuck waiting to prepare the
> display reset (by using a very simple timeout), resort to cancelling all
> in-flight requests and throwing the user data into /dev/null, which is
> marginally better than the driver locking up and keeping that data to
> itself.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
>   drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>   2 files changed, 44 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c2213016fd86..973f4f9e6b84 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>   	/* Mark all executing requests as skipped */
>   	spin_lock_irqsave(&engine->timeline->lock, flags);
>   	list_for_each_entry(request, &engine->timeline->requests, link)
> -		dma_fence_set_error(&request->fence, -EIO);
> +		if (!i915_gem_request_completed(request))
> +			dma_fence_set_error(&request->fence, -EIO);
>   	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>   
>   	/* Mark all pending requests as complete so that any concurrent
> @@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
>   
> +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
>   	for_each_engine(engine, i915, id)
>   		engine_set_wedged(engine);
>   
> @@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>   
>   void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>   {
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> -
> -	/* Retire completed requests first so the list of inflight/incomplete
> -	 * requests is accurate and we don't try and mark successful requests
> -	 * as in error during __i915_gem_set_wedged_BKL().
> -	 */
> -	i915_gem_retire_requests(dev_priv);
> -
>   	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> -
> -	i915_gem_contexts_lost(dev_priv);
> -
> -	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);

What was the purpose of this, since it is now gone?

>   }
>   
>   bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> @@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>   	 * context and do not require stop_machine().
>   	 */
>   	intel_engines_reset_default_submission(i915);
> +	i915_gem_contexts_lost(i915);
>   
>   	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>   	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index bce2d1feceb1..6585bcdf61a6 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>   	return ret;
>   }
>   
> +struct wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
> +		  w->name);

Do you plan to call this from other places to need the name string?

> +	i915_gem_set_wedged(w->i915);

Is it safe to do the execlist_port manipulation at this point? Couldn't 
we receive an interrupt for one request completing after we have cleared 
it from ports but before the actual reset?

> +}
> +
> +static bool init_wedge_on_timeout(struct wedge_me *w,
> +				  struct drm_i915_private *i915,
> +				  long timeout,
> +				  const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +
> +	schedule_delayed_work(&w->work, HZ);

One second is pessimistic enough etc?

> +	return true;

Is the return value needed?

> +}
> +
> +static void destroy_wedge_on_timeout(struct wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +}
> +

Slight objection to the "on stack" API since it is disconnected from the 
allocation so does not directly see whether it is really on stack. But 
it is close enough to it and with just one user so could be passable.

>   /**
>    * i915_reset_and_wakeup - do process context error handling work
>    * @dev_priv: i915 device private
> @@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)

Will need rebase.

>   	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>   	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>   	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> +	struct wedge_me w;
>   
>   	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>   
>   	DRM_DEBUG_DRIVER("resetting chip\n");
>   	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>   
> -	intel_prepare_reset(dev_priv);
> +	if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
> +		intel_prepare_reset(dev_priv);
> +		destroy_wedge_on_timeout(&w);
> +	}
>   
>   	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
>   	wake_up_all(&dev_priv->gpu_error.wait_queue);
> @@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>   				     HZ));
>   
>   	intel_finish_reset(dev_priv);
> +	mod_delayed_work(dev_priv->wq, &dev_priv->gt.retire_work, 0);

A comment explaining why this?

>   
>   	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
>   		kobject_uevent_env(kobj,
> 

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-21 11:30   ` Tvrtko Ursulin
@ 2017-06-21 11:46     ` Chris Wilson
  2017-06-21 12:40       ` Chris Wilson
  2017-06-21 13:37       ` Tvrtko Ursulin
  0 siblings, 2 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-21 11:46 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: Mika Kuoppala

Quoting Tvrtko Ursulin (2017-06-21 12:30:15)
> 
> On 20/06/2017 20:55, Chris Wilson wrote:
> > Trying to do a modeset from within a reset is fraught with danger. We
> > can fall into a cyclic deadlock where the modeset is waiting on a
> > previous modeset that is waiting on a request, and since the GPU hung
> > that request completion is waiting on the reset. As modesetting doesn't
> > allow its locks to be broken and restarted, or for its *own* reset
> > mechanism to take over the display, we have to do something very
> > evil instead. If we detect that we are stuck waiting to prepare the
> > display reset (by using a very simple timeout), resort to cancelling all
> > in-flight requests and throwing the user data into /dev/null, which is
> > marginally better than the driver locking up and keeping that data to
> > itself.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
> >   drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
> >   2 files changed, 44 insertions(+), 15 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index c2213016fd86..973f4f9e6b84 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> >       /* Mark all executing requests as skipped */
> >       spin_lock_irqsave(&engine->timeline->lock, flags);
> >       list_for_each_entry(request, &engine->timeline->requests, link)
> > -             dma_fence_set_error(&request->fence, -EIO);
> > +             if (!i915_gem_request_completed(request))
> > +                     dma_fence_set_error(&request->fence, -EIO);
> >       spin_unlock_irqrestore(&engine->timeline->lock, flags);
> >   
> >       /* Mark all pending requests as complete so that any concurrent
> > @@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >       struct intel_engine_cs *engine;
> >       enum intel_engine_id id;
> >   
> > +     set_bit(I915_WEDGED, &i915->gpu_error.flags);
> >       for_each_engine(engine, i915, id)
> >               engine_set_wedged(engine);
> >   
> > @@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >   
> >   void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
> >   {
> > -     lockdep_assert_held(&dev_priv->drm.struct_mutex);
> > -     set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> > -
> > -     /* Retire completed requests first so the list of inflight/incomplete
> > -      * requests is accurate and we don't try and mark successful requests
> > -      * as in error during __i915_gem_set_wedged_BKL().
> > -      */
> > -     i915_gem_retire_requests(dev_priv);
> > -
> >       stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> > -
> > -     i915_gem_contexts_lost(dev_priv);
> > -
> > -     mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
> 
> What was the purpose of this, since it is now gone?

It was to clean up the device after explicitly cancelling everything. In
the pre-reset path, I felt it was overkill and replaced with the retire_work
(which will then call idle_work as required).

Hmm, thinking about the reset cycle, the retirement is forced anyway. My
main concern is that we lose the cleanliness of a "normal" wedging.

> >   bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> > @@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> >        * context and do not require stop_machine().
> >        */
> >       intel_engines_reset_default_submission(i915);
> > +     i915_gem_contexts_lost(i915);
> >   
> >       smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> >       clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index bce2d1feceb1..6585bcdf61a6 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
> >       return ret;
> >   }
> >   
> > +struct wedge_me {
> > +     struct delayed_work work;
> > +     struct drm_i915_private *i915;
> > +     const char *name;
> > +};
> > +
> > +static void wedge_me(struct work_struct *work)
> > +{
> > +     struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > +
> > +     DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
> > +               w->name);
> 
> Do you plan to call this from other places to need the name string?

I was keeping that option open.

> 
> > +     i915_gem_set_wedged(w->i915);
> 
> Is it safe to do the execlist_port manipulation at this point? Couldn't 
> we receive an interrupt for one request completing after we have cleared 
> it from ports but before the actual reset?

We do the port manipulation and queue cancellation inside stop_machine,
i.e. in complete isolation. That ensures we won't succumb to a race
there, we just have to be careful that the execlists cancellation works
once the machine process the interrupts afterwards. That I'm not sure
about...

> > +static bool init_wedge_on_timeout(struct wedge_me *w,
> > +                               struct drm_i915_private *i915,
> > +                               long timeout,
> > +                               const char *name)
> > +{
> > +     w->i915 = i915;
> > +     w->name = name;
> > +     INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> > +
> > +     schedule_delayed_work(&w->work, HZ);
> 
> One second is pessimistic enough etc?

It's a hard call. If we make it too long, then the modesetting times out
with even worse results. Too short and we may timeout in the middle of a
slow eDP cycle.

> 
> > +     return true;
> 
> Is the return value needed?

That's just for syntatic sugar. What I wanted was something like

	i915_try_or_wedge(params) {
		dangerous_func().
	}

if (init_wedge()) { } was easy with no macro abuse.

The best macro abuse I can think of is for(init_wedge(w, params); try_wedge(w); )

> > +}
> > +
> > +static void destroy_wedge_on_timeout(struct wedge_me *w)
> > +{
> > +     cancel_delayed_work_sync(&w->work);
> > +     destroy_delayed_work_on_stack(&w->work);
> > +}
> > +
> 
> Slight objection to the "on stack" API since it is disconnected from the 
> allocation so does not directly see whether it is really on stack. But 
> it is close enough to it and with just one user so could be passable.

I hear you, but the stack properties and ensuring a strictly controlled
critical section were too nice.

> >   /**
> >    * i915_reset_and_wakeup - do process context error handling work
> >    * @dev_priv: i915 device private
> > @@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
> 
> Will need rebase.

git is magic.

> >       char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> >       char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> >       char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> > +     struct wedge_me w;
> >   
> >       kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> >   
> >       DRM_DEBUG_DRIVER("resetting chip\n");
> >       kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> >   
> > -     intel_prepare_reset(dev_priv);
> > +     if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
> > +             intel_prepare_reset(dev_priv);
> > +             destroy_wedge_on_timeout(&w);
> > +     }
> >   
> >       set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> >       wake_up_all(&dev_priv->gpu_error.wait_queue);
> > @@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
> >                                    HZ));
> >   
> >       intel_finish_reset(dev_priv);
> > +     mod_delayed_work(dev_priv->wq, &dev_priv->gt.retire_work, 0);
> 
> A comment explaining why this?

It's the replacement for the removed idle_work, but now I realise that we
are guaranteed a i915_gem_retire_requests() as part of the reset procedure.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-21 11:46     ` Chris Wilson
@ 2017-06-21 12:40       ` Chris Wilson
  2017-06-21 13:37       ` Tvrtko Ursulin
  1 sibling, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-21 12:40 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: Mika Kuoppala

Quoting Chris Wilson (2017-06-21 12:46:51)
> Quoting Tvrtko Ursulin (2017-06-21 12:30:15)
> > 
> > On 20/06/2017 20:55, Chris Wilson wrote:
> > > +     i915_gem_set_wedged(w->i915);
> > 
> > Is it safe to do the execlist_port manipulation at this point? Couldn't 
> > we receive an interrupt for one request completing after we have cleared 
> > it from ports but before the actual reset?
> 
> We do the port manipulation and queue cancellation inside stop_machine,
> i.e. in complete isolation. That ensures we won't succumb to a race
> there, we just have to be careful that the execlists cancellation works
> once the machine process the interrupts afterwards. That I'm not sure
> about...

I got very worried, but then remembered we installed a filter on the
intel_lrc_irq_handler; that is we don't call into the irq handler unless
port[0] is set. So we should not get an extra call between the
stop_machine() and the actual reset, but I suppose we might get a
conitinuation of a tasklet? I doubt it, but adding a clear_bit to
i915_gem_set_wedged would alleviate that concern.

I'm happier, but randomly calling i915_gem_set_wedged() is definitely
undertested (just gem_eio tries), it is and always has been meant as a
last resort to keep the driver limping along.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-21 11:46     ` Chris Wilson
  2017-06-21 12:40       ` Chris Wilson
@ 2017-06-21 13:37       ` Tvrtko Ursulin
  2017-06-21 13:48         ` Chris Wilson
  1 sibling, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2017-06-21 13:37 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala


On 21/06/2017 12:46, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2017-06-21 12:30:15)
>>
>> On 20/06/2017 20:55, Chris Wilson wrote:
>>> Trying to do a modeset from within a reset is fraught with danger. We
>>> can fall into a cyclic deadlock where the modeset is waiting on a
>>> previous modeset that is waiting on a request, and since the GPU hung
>>> that request completion is waiting on the reset. As modesetting doesn't
>>> allow its locks to be broken and restarted, or for its *own* reset
>>> mechanism to take over the display, we have to do something very
>>> evil instead. If we detect that we are stuck waiting to prepare the
>>> display reset (by using a very simple timeout), resort to cancelling all
>>> in-flight requests and throwing the user data into /dev/null, which is
>>> marginally better than the driver locking up and keeping that data to
>>> itself.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
>>> ---
>>>    drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
>>>    drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>>>    2 files changed, 44 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index c2213016fd86..973f4f9e6b84 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>>>        /* Mark all executing requests as skipped */
>>>        spin_lock_irqsave(&engine->timeline->lock, flags);
>>>        list_for_each_entry(request, &engine->timeline->requests, link)
>>> -             dma_fence_set_error(&request->fence, -EIO);
>>> +             if (!i915_gem_request_completed(request))
>>> +                     dma_fence_set_error(&request->fence, -EIO);
>>>        spin_unlock_irqrestore(&engine->timeline->lock, flags);
>>>    
>>>        /* Mark all pending requests as complete so that any concurrent
>>> @@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>>        struct intel_engine_cs *engine;
>>>        enum intel_engine_id id;
>>>    
>>> +     set_bit(I915_WEDGED, &i915->gpu_error.flags);
>>>        for_each_engine(engine, i915, id)
>>>                engine_set_wedged(engine);
>>>    
>>> @@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>>    
>>>    void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>>>    {
>>> -     lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>> -     set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
>>> -
>>> -     /* Retire completed requests first so the list of inflight/incomplete
>>> -      * requests is accurate and we don't try and mark successful requests
>>> -      * as in error during __i915_gem_set_wedged_BKL().
>>> -      */
>>> -     i915_gem_retire_requests(dev_priv);
>>> -
>>>        stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
>>> -
>>> -     i915_gem_contexts_lost(dev_priv);
>>> -
>>> -     mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
>>
>> What was the purpose of this, since it is now gone?
> 
> It was to clean up the device after explicitly cancelling everything. In
> the pre-reset path, I felt it was overkill and replaced with the retire_work
> (which will then call idle_work as required).
> 
> Hmm, thinking about the reset cycle, the retirement is forced anyway. My
> main concern is that we lose the cleanliness of a "normal" wedging.
> 
>>>    bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>> @@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>>         * context and do not require stop_machine().
>>>         */
>>>        intel_engines_reset_default_submission(i915);
>>> +     i915_gem_contexts_lost(i915);
>>>    
>>>        smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>>>        clear_bit(I915_WEDGED, &i915->gpu_error.flags);
>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index bce2d1feceb1..6585bcdf61a6 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>>>        return ret;
>>>    }
>>>    
>>> +struct wedge_me {
>>> +     struct delayed_work work;
>>> +     struct drm_i915_private *i915;
>>> +     const char *name;
>>> +};
>>> +
>>> +static void wedge_me(struct work_struct *work)
>>> +{
>>> +     struct wedge_me *w = container_of(work, typeof(*w), work.work);
>>> +
>>> +     DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
>>> +               w->name);
>>
>> Do you plan to call this from other places to need the name string?
> 
> I was keeping that option open. >
>>
>>> +     i915_gem_set_wedged(w->i915);
>>
>> Is it safe to do the execlist_port manipulation at this point? Couldn't
>> we receive an interrupt for one request completing after we have cleared
>> it from ports but before the actual reset?
> 
> We do the port manipulation and queue cancellation inside stop_machine,
> i.e. in complete isolation. That ensures we won't succumb to a race
> there, we just have to be careful that the execlists cancellation works
> once the machine process the interrupts afterwards. That I'm not sure
> about...
> 
>>> +static bool init_wedge_on_timeout(struct wedge_me *w,
>>> +                               struct drm_i915_private *i915,
>>> +                               long timeout,
>>> +                               const char *name)
>>> +{
>>> +     w->i915 = i915;
>>> +     w->name = name;
>>> +     INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
>>> +
>>> +     schedule_delayed_work(&w->work, HZ);
>>
>> One second is pessimistic enough etc?
> 
> It's a hard call. If we make it too long, then the modesetting times out
> with even worse results. Too short and we may timeout in the middle of a
> slow eDP cycle.
> 
>>
>>> +     return true;
>>
>> Is the return value needed?
> 
> That's just for syntatic sugar. What I wanted was something like
> 
> 	i915_try_or_wedge(params) {
> 		dangerous_func().
> 	}
> 
> if (init_wedge()) { } was easy with no macro abuse.
> 
> The best macro abuse I can think of is for(init_wedge(w, params); try_wedge(w); )
> 
>>> +}
>>> +
>>> +static void destroy_wedge_on_timeout(struct wedge_me *w)
>>> +{
>>> +     cancel_delayed_work_sync(&w->work);
>>> +     destroy_delayed_work_on_stack(&w->work);
>>> +}
>>> +
>>
>> Slight objection to the "on stack" API since it is disconnected from the
>> allocation so does not directly see whether it is really on stack. But
>> it is close enough to it and with just one user so could be passable.
> 
> I hear you, but the stack properties and ensuring a strictly controlled
> critical section were too nice.
> 
>>>    /**
>>>     * i915_reset_and_wakeup - do process context error handling work
>>>     * @dev_priv: i915 device private
>>> @@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>>
>> Will need rebase.
> 
> git is magic.
> 
>>>        char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>>>        char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>>>        char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
>>> +     struct wedge_me w;
>>>    
>>>        kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>>>    
>>>        DRM_DEBUG_DRIVER("resetting chip\n");
>>>        kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>>>    
>>> -     intel_prepare_reset(dev_priv);
>>> +     if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
>>> +             intel_prepare_reset(dev_priv);
>>> +             destroy_wedge_on_timeout(&w);
>>> +     }
>>>    
>>>        set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
>>>        wake_up_all(&dev_priv->gpu_error.wait_queue);
>>> @@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>>>                                     HZ));
>>>    
>>>        intel_finish_reset(dev_priv);
>>> +     mod_delayed_work(dev_priv->wq, &dev_priv->gt.retire_work, 0);
>>
>> A comment explaining why this?
> 
> It's the replacement for the removed idle_work, but now I realise that we
> are guaranteed a i915_gem_retire_requests() as part of the reset procedure.

So will be respinning with this removed?

Regards,

Tvrtko


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] drm/i915: Break modeset deadlocks on reset
  2017-06-21 13:37       ` Tvrtko Ursulin
@ 2017-06-21 13:48         ` Chris Wilson
  0 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-21 13:48 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: Mika Kuoppala

Quoting Tvrtko Ursulin (2017-06-21 14:37:33)
> 
> On 21/06/2017 12:46, Chris Wilson wrote:
> > It's the replacement for the removed idle_work, but now I realise that we
> > are guaranteed a i915_gem_retire_requests() as part of the reset procedure.
> 
> So will be respinning with this removed?

I will send it on after finishing an oops reproducer (for
drm_setversion). Currently I've bdw/bxt doing hang testing just to gain
some more confidence -- though by design they shouldn't be hitting this
path at all anymore (per-engine lockless resets ftw). pnv now recovers
from its struct_mutex deadlock just fine.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v2] drm/i915: Break modeset deadlocks on reset
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
  2017-06-20 19:55 ` Chris Wilson
  2017-06-20 20:11 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev2) Patchwork
@ 2017-06-21 14:24 ` Chris Wilson
  2017-06-21 15:55   ` Maarten Lankhorst
  2017-06-21 15:39 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev3) Patchwork
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2017-06-21 14:24 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.

This is not a fix; this is just a workaround that unbreaks machines
until we can resolve the deadlock in a way that doesn't lose data!

v2: Move the retirement from set-wegded to the i915_reset() error path,
after which we no longer any delayed worker cleanup for
i915_handle_error()
v3: C abuse for syntactic sugar

References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c |  1 +
 drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
 drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d1aa10c9cc5d..1df957b986c7 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 
 error:
 	i915_gem_set_wedged(dev_priv);
+	i915_gem_retire_requests(dev_priv);
 	goto finish;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ae3ce1314bd1..36d838677982 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
 	/* Mark all executing requests as skipped */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 	list_for_each_entry(request, &engine->timeline->requests, link)
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EIO);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* Mark all pending requests as complete so that any concurrent
@@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
 	for_each_engine(engine, i915, id)
 		engine_set_wedged(engine);
 
@@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
-
-	/* Retire completed requests first so the list of inflight/incomplete
-	 * requests is accurate and we don't try and mark successful requests
-	 * as in error during __i915_gem_set_wedged_BKL().
-	 */
-	i915_gem_retire_requests(dev_priv);
-
 	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
-
-	i915_gem_contexts_lost(dev_priv);
-
-	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
@@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
+	i915_gem_contexts_lost(i915);
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index b1c7d1a04612..c948a5bd031c 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	dev_err(w->i915->drm.dev,
+		"%s timed out, cancelling all in-flight rendering.\n",
+		w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+			 struct drm_i915_private *i915,
+			 long timeout,
+			 const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
+
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG)			\
+	for (__init_wedge((W), (DEV), (TIMEOUT), (MSG));		\
+	     (W)->i915;							\
+	     __fini_wedge((W)))
+
 /**
  * i915_reset_device - do process context error handling work
  * @dev_priv: i915 device private
@@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	intel_prepare_reset(dev_priv);
+	i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
+		intel_prepare_reset(dev_priv);
+	}
 
 	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.wait_queue);
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev3)
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
                   ` (2 preceding siblings ...)
  2017-06-21 14:24 ` [PATCH v2] drm/i915: Break modeset deadlocks on reset Chris Wilson
@ 2017-06-21 15:39 ` Patchwork
  2017-06-22 10:56 ` [PATCH v3] drm/i915: Break modeset deadlocks on reset Chris Wilson
  2017-06-22 11:33 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev4) Patchwork
  5 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2017-06-21 15:39 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Break modeset deadlocks on reset (rev3)
URL   : https://patchwork.freedesktop.org/series/26059/
State : success

== Summary ==

Series 26059v3 drm/i915: Break modeset deadlocks on reset
https://patchwork.freedesktop.org/api/1.0/series/26059/revisions/3/mbox/

Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                dmesg-warn -> PASS       (fi-snb-2600) fdo#100125
Test kms_pipe_crc_basic:
        Subgroup suspend-read-crc-pipe-b:
                dmesg-warn -> PASS       (fi-byt-j1900) fdo#101516
Test prime_busy:
        Subgroup basic-wait-after-default:
                dmesg-warn -> PASS       (fi-skl-6700hq) fdo#101515 +3

fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#101516 https://bugs.freedesktop.org/show_bug.cgi?id=101516
fdo#101515 https://bugs.freedesktop.org/show_bug.cgi?id=101515

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time:441s
fi-bdw-gvtdvm    total:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  time:427s
fi-bsw-n3050     total:278  pass:241  dwarn:1   dfail:0   fail:0   skip:36  time:520s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:510s
fi-byt-j1900     total:278  pass:253  dwarn:1   dfail:0   fail:0   skip:24  time:491s
fi-byt-n2820     total:278  pass:249  dwarn:1   dfail:0   fail:0   skip:28  time:481s
fi-glk-2a        total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:596s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:431s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:415s
fi-ilk-650       total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  time:415s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:497s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:473s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:462s
fi-kbl-7560u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:573s
fi-kbl-r         total:278  pass:259  dwarn:1   dfail:0   fail:0   skip:18  time:575s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:454s
fi-skl-6700hq    total:278  pass:220  dwarn:3   dfail:0   fail:30  skip:24  time:341s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time:464s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:479s
fi-skl-gvtdvm    total:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  time:434s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time:541s
fi-snb-2600      total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  time:402s

40b1ea11ae139b383d62f22ed2007db408f42a3e drm-tip: 2017y-06m-21d-14h-16m-17s UTC integration manifest
37a5010 drm/i915: Break modeset deadlocks on reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_5015/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] drm/i915: Break modeset deadlocks on reset
  2017-06-21 14:24 ` [PATCH v2] drm/i915: Break modeset deadlocks on reset Chris Wilson
@ 2017-06-21 15:55   ` Maarten Lankhorst
  2017-06-21 16:06     ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Maarten Lankhorst @ 2017-06-21 15:55 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala

Op 21-06-17 om 16:24 schreef Chris Wilson:
> Trying to do a modeset from within a reset is fraught with danger. We
> can fall into a cyclic deadlock where the modeset is waiting on a
> previous modeset that is waiting on a request, and since the GPU hung
> that request completion is waiting on the reset. As modesetting doesn't
> allow its locks to be broken and restarted, or for its *own* reset
> mechanism to take over the display, we have to do something very
> evil instead. If we detect that we are stuck waiting to prepare the
> display reset (by using a very simple timeout), resort to cancelling all
> in-flight requests and throwing the user data into /dev/null, which is
> marginally better than the driver locking up and keeping that data to
> itself.
>
> This is not a fix; this is just a workaround that unbreaks machines
> until we can resolve the deadlock in a way that doesn't lose data!
>
> v2: Move the retirement from set-wegded to the i915_reset() error path,
> after which we no longer any delayed worker cleanup for
> i915_handle_error()
> v3: C abuse for syntactic sugar
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c |  1 +
>  drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
>  drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 49 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d1aa10c9cc5d..1df957b986c7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  
>  error:
>  	i915_gem_set_wedged(dev_priv);
> +	i915_gem_retire_requests(dev_priv);
>  	goto finish;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index ae3ce1314bd1..36d838677982 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>  	/* Mark all executing requests as skipped */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>  	list_for_each_entry(request, &engine->timeline->requests, link)
> -		dma_fence_set_error(&request->fence, -EIO);
> +		if (!i915_gem_request_completed(request))
> +			dma_fence_set_error(&request->fence, -EIO);
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  
>  	/* Mark all pending requests as complete so that any concurrent
> @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
>  	for_each_engine(engine, i915, id)
>  		engine_set_wedged(engine);
>  
> @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>  
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>  {
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> -
> -	/* Retire completed requests first so the list of inflight/incomplete
> -	 * requests is accurate and we don't try and mark successful requests
> -	 * as in error during __i915_gem_set_wedged_BKL().
> -	 */
> -	i915_gem_retire_requests(dev_priv);
> -
>  	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> -
> -	i915_gem_contexts_lost(dev_priv);
> -
> -	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
>  }
>  
>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  	 * context and do not require stop_machine().
>  	 */
>  	intel_engines_reset_default_submission(i915);
> +	i915_gem_contexts_lost(i915);
>  
>  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index b1c7d1a04612..c948a5bd031c 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>  	return ret;
>  }
>  
> +struct wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	dev_err(w->i915->drm.dev,
> +		"%s timed out, cancelling all in-flight rendering.\n",
> +		w->name);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __init_wedge(struct wedge_me *w,
> +			 struct drm_i915_private *i915,
> +			 long timeout,
> +			 const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __fini_wedge(struct wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> +
> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG)			\
> +	for (__init_wedge((W), (DEV), (TIMEOUT), (MSG));		\
> +	     (W)->i915;							\
> +	     __fini_wedge((W)))
> +
>  /**
>   * i915_reset_device - do process context error handling work
>   * @dev_priv: i915 device private
> @@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
>  	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>  	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>  	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> +	struct wedge_me w;
>  
>  	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>  
>  	DRM_DEBUG_DRIVER("resetting chip\n");
>  	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>  
> -	intel_prepare_reset(dev_priv);
> +	i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
> +		intel_prepare_reset(dev_priv);
> +	}
I think v2 was more clear here.

But if this is just for the modeset locks, why not add a timeout to i915_sw_fence_await_reservation on old_obj?

That would allow us to end the deadlocks without having to kill off all fences first.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] drm/i915: Break modeset deadlocks on reset
  2017-06-21 15:55   ` Maarten Lankhorst
@ 2017-06-21 16:06     ` Chris Wilson
  2017-06-21 17:27       ` Maarten Lankhorst
  0 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2017-06-21 16:06 UTC (permalink / raw)
  To: Maarten Lankhorst, intel-gfx; +Cc: Mika Kuoppala

Quoting Maarten Lankhorst (2017-06-21 16:55:06)
> Op 21-06-17 om 16:24 schreef Chris Wilson:
> > Trying to do a modeset from within a reset is fraught with danger. We
> > can fall into a cyclic deadlock where the modeset is waiting on a
> > previous modeset that is waiting on a request, and since the GPU hung
> > that request completion is waiting on the reset. As modesetting doesn't
> > allow its locks to be broken and restarted, or for its *own* reset
> > mechanism to take over the display, we have to do something very
> > evil instead. If we detect that we are stuck waiting to prepare the
> > display reset (by using a very simple timeout), resort to cancelling all
> > in-flight requests and throwing the user data into /dev/null, which is
> > marginally better than the driver locking up and keeping that data to
> > itself.
> >
> > This is not a fix; this is just a workaround that unbreaks machines
> > until we can resolve the deadlock in a way that doesn't lose data!
> >
> > v2: Move the retirement from set-wegded to the i915_reset() error path,
> > after which we no longer any delayed worker cleanup for
> > i915_handle_error()
> > v3: C abuse for syntactic sugar
> >
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c |  1 +
> >  drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
> >  drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
> >  3 files changed, 49 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index d1aa10c9cc5d..1df957b986c7 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
> >  
> >  error:
> >       i915_gem_set_wedged(dev_priv);
> > +     i915_gem_retire_requests(dev_priv);
> >       goto finish;
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index ae3ce1314bd1..36d838677982 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
> >       /* Mark all executing requests as skipped */
> >       spin_lock_irqsave(&engine->timeline->lock, flags);
> >       list_for_each_entry(request, &engine->timeline->requests, link)
> > -             dma_fence_set_error(&request->fence, -EIO);
> > +             if (!i915_gem_request_completed(request))
> > +                     dma_fence_set_error(&request->fence, -EIO);
> >       spin_unlock_irqrestore(&engine->timeline->lock, flags);
> >  
> >       /* Mark all pending requests as complete so that any concurrent
> > @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >       struct intel_engine_cs *engine;
> >       enum intel_engine_id id;
> >  
> > +     set_bit(I915_WEDGED, &i915->gpu_error.flags);
> >       for_each_engine(engine, i915, id)
> >               engine_set_wedged(engine);
> >  
> > @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
> >  
> >  void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
> >  {
> > -     lockdep_assert_held(&dev_priv->drm.struct_mutex);
> > -     set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> > -
> > -     /* Retire completed requests first so the list of inflight/incomplete
> > -      * requests is accurate and we don't try and mark successful requests
> > -      * as in error during __i915_gem_set_wedged_BKL().
> > -      */
> > -     i915_gem_retire_requests(dev_priv);
> > -
> >       stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> > -
> > -     i915_gem_contexts_lost(dev_priv);
> > -
> > -     mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
> >  }
> >  
> >  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> > @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> >        * context and do not require stop_machine().
> >        */
> >       intel_engines_reset_default_submission(i915);
> > +     i915_gem_contexts_lost(i915);
> >  
> >       smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> >       clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> > diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> > index b1c7d1a04612..c948a5bd031c 100644
> > --- a/drivers/gpu/drm/i915/i915_irq.c
> > +++ b/drivers/gpu/drm/i915/i915_irq.c
> > @@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
> >       return ret;
> >  }
> >  
> > +struct wedge_me {
> > +     struct delayed_work work;
> > +     struct drm_i915_private *i915;
> > +     const char *name;
> > +};
> > +
> > +static void wedge_me(struct work_struct *work)
> > +{
> > +     struct wedge_me *w = container_of(work, typeof(*w), work.work);
> > +
> > +     dev_err(w->i915->drm.dev,
> > +             "%s timed out, cancelling all in-flight rendering.\n",
> > +             w->name);
> > +     i915_gem_set_wedged(w->i915);
> > +}
> > +
> > +static void __init_wedge(struct wedge_me *w,
> > +                      struct drm_i915_private *i915,
> > +                      long timeout,
> > +                      const char *name)
> > +{
> > +     w->i915 = i915;
> > +     w->name = name;
> > +
> > +     INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> > +     schedule_delayed_work(&w->work, timeout);
> > +}
> > +
> > +static void __fini_wedge(struct wedge_me *w)
> > +{
> > +     cancel_delayed_work_sync(&w->work);
> > +     destroy_delayed_work_on_stack(&w->work);
> > +     w->i915 = NULL;
> > +}
> > +
> > +#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG)                  \
> > +     for (__init_wedge((W), (DEV), (TIMEOUT), (MSG));                \
> > +          (W)->i915;                                                 \
> > +          __fini_wedge((W)))
> > +
> >  /**
> >   * i915_reset_device - do process context error handling work
> >   * @dev_priv: i915 device private
> > @@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
> >       char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> >       char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> >       char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> > +     struct wedge_me w;
> >  
> >       kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> >  
> >       DRM_DEBUG_DRIVER("resetting chip\n");
> >       kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> >  
> > -     intel_prepare_reset(dev_priv);
> > +     i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
> > +             intel_prepare_reset(dev_priv);
> > +     }
> I think v2 was more clear here.
> 
> But if this is just for the modeset locks, why not add a timeout to i915_sw_fence_await_reservation on old_obj?

That wait prevents further GPU death in batches that may still be
pending. The mechanism that we need to use is the reset. However,
resetting the GPU wants to meddle with modesetting, yet offers no way to
reset the display from within reset.

> That would allow us to end the deadlocks without having to kill off all fences first.

Instead you simply ignore fences. It is no more a fix to the deadlock
than cancelling the fence. The issue is not the wait on the rendering.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] drm/i915: Break modeset deadlocks on reset
  2017-06-21 16:06     ` Chris Wilson
@ 2017-06-21 17:27       ` Maarten Lankhorst
  0 siblings, 0 replies; 17+ messages in thread
From: Maarten Lankhorst @ 2017-06-21 17:27 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala

Op 21-06-17 om 18:06 schreef Chris Wilson:
> Quoting Maarten Lankhorst (2017-06-21 16:55:06)
>> Op 21-06-17 om 16:24 schreef Chris Wilson:
>>> Trying to do a modeset from within a reset is fraught with danger. We
>>> can fall into a cyclic deadlock where the modeset is waiting on a
>>> previous modeset that is waiting on a request, and since the GPU hung
>>> that request completion is waiting on the reset. As modesetting doesn't
>>> allow its locks to be broken and restarted, or for its *own* reset
>>> mechanism to take over the display, we have to do something very
>>> evil instead. If we detect that we are stuck waiting to prepare the
>>> display reset (by using a very simple timeout), resort to cancelling all
>>> in-flight requests and throwing the user data into /dev/null, which is
>>> marginally better than the driver locking up and keeping that data to
>>> itself.
>>>
>>> This is not a fix; this is just a workaround that unbreaks machines
>>> until we can resolve the deadlock in a way that doesn't lose data!
>>>
>>> v2: Move the retirement from set-wegded to the i915_reset() error path,
>>> after which we no longer any delayed worker cleanup for
>>> i915_handle_error()
>>> v3: C abuse for syntactic sugar
>>>
>>> References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
>>> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
>>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>> ---
>>>  drivers/gpu/drm/i915/i915_drv.c |  1 +
>>>  drivers/gpu/drm/i915/i915_gem.c | 18 ++++-------------
>>>  drivers/gpu/drm/i915/i915_irq.c | 45 ++++++++++++++++++++++++++++++++++++++++-
>>>  3 files changed, 49 insertions(+), 15 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
>>> index d1aa10c9cc5d..1df957b986c7 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.c
>>> +++ b/drivers/gpu/drm/i915/i915_drv.c
>>> @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>>>  
>>>  error:
>>>       i915_gem_set_wedged(dev_priv);
>>> +     i915_gem_retire_requests(dev_priv);
>>>       goto finish;
>>>  }
>>>  
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index ae3ce1314bd1..36d838677982 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -3062,7 +3062,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>>>       /* Mark all executing requests as skipped */
>>>       spin_lock_irqsave(&engine->timeline->lock, flags);
>>>       list_for_each_entry(request, &engine->timeline->requests, link)
>>> -             dma_fence_set_error(&request->fence, -EIO);
>>> +             if (!i915_gem_request_completed(request))
>>> +                     dma_fence_set_error(&request->fence, -EIO);
>>>       spin_unlock_irqrestore(&engine->timeline->lock, flags);
>>>  
>>>       /* Mark all pending requests as complete so that any concurrent
>>> @@ -3108,6 +3109,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>>       struct intel_engine_cs *engine;
>>>       enum intel_engine_id id;
>>>  
>>> +     set_bit(I915_WEDGED, &i915->gpu_error.flags);
>>>       for_each_engine(engine, i915, id)
>>>               engine_set_wedged(engine);
>>>  
>>> @@ -3116,20 +3118,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>>>  
>>>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>>>  {
>>> -     lockdep_assert_held(&dev_priv->drm.struct_mutex);
>>> -     set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
>>> -
>>> -     /* Retire completed requests first so the list of inflight/incomplete
>>> -      * requests is accurate and we don't try and mark successful requests
>>> -      * as in error during __i915_gem_set_wedged_BKL().
>>> -      */
>>> -     i915_gem_retire_requests(dev_priv);
>>> -
>>>       stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
>>> -
>>> -     i915_gem_contexts_lost(dev_priv);
>>> -
>>> -     mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
>>>  }
>>>  
>>>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>> @@ -3184,6 +3173,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>>>        * context and do not require stop_machine().
>>>        */
>>>       intel_engines_reset_default_submission(i915);
>>> +     i915_gem_contexts_lost(i915);
>>>  
>>>       smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>>>       clear_bit(I915_WEDGED, &i915->gpu_error.flags);
>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index b1c7d1a04612..c948a5bd031c 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -2599,6 +2599,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>>>       return ret;
>>>  }
>>>  
>>> +struct wedge_me {
>>> +     struct delayed_work work;
>>> +     struct drm_i915_private *i915;
>>> +     const char *name;
>>> +};
>>> +
>>> +static void wedge_me(struct work_struct *work)
>>> +{
>>> +     struct wedge_me *w = container_of(work, typeof(*w), work.work);
>>> +
>>> +     dev_err(w->i915->drm.dev,
>>> +             "%s timed out, cancelling all in-flight rendering.\n",
>>> +             w->name);
>>> +     i915_gem_set_wedged(w->i915);
>>> +}
>>> +
>>> +static void __init_wedge(struct wedge_me *w,
>>> +                      struct drm_i915_private *i915,
>>> +                      long timeout,
>>> +                      const char *name)
>>> +{
>>> +     w->i915 = i915;
>>> +     w->name = name;
>>> +
>>> +     INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
>>> +     schedule_delayed_work(&w->work, timeout);
>>> +}
>>> +
>>> +static void __fini_wedge(struct wedge_me *w)
>>> +{
>>> +     cancel_delayed_work_sync(&w->work);
>>> +     destroy_delayed_work_on_stack(&w->work);
>>> +     w->i915 = NULL;
>>> +}
>>> +
>>> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT, MSG)                  \
>>> +     for (__init_wedge((W), (DEV), (TIMEOUT), (MSG));                \
>>> +          (W)->i915;                                                 \
>>> +          __fini_wedge((W)))
>>> +
>>>  /**
>>>   * i915_reset_device - do process context error handling work
>>>   * @dev_priv: i915 device private
>>> @@ -2612,13 +2652,16 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
>>>       char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>>>       char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>>>       char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
>>> +     struct wedge_me w;
>>>  
>>>       kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>>>  
>>>       DRM_DEBUG_DRIVER("resetting chip\n");
>>>       kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>>>  
>>> -     intel_prepare_reset(dev_priv);
>>> +     i915_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset") {
>>> +             intel_prepare_reset(dev_priv);
>>> +     }
>> I think v2 was more clear here.
>>
>> But if this is just for the modeset locks, why not add a timeout to i915_sw_fence_await_reservation on old_obj?
> That wait prevents further GPU death in batches that may still be
> pending. The mechanism that we need to use is the reset. However,
> resetting the GPU wants to meddle with modesetting, yet offers no way to
> reset the display from within reset.
True. However it doesn't prevent gpu death when userspace disables the plane separately from a modeset.
>> That would allow us to end the deadlocks without having to kill off all fences first.
> Instead you simply ignore fences. It is no more a fix to the deadlock
> than cancelling the fence. The issue is not the wait on the rendering.
> -Chris
Could we do both?

~Maarten
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH v3] drm/i915: Break modeset deadlocks on reset
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
                   ` (3 preceding siblings ...)
  2017-06-21 15:39 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev3) Patchwork
@ 2017-06-22 10:56 ` Chris Wilson
  2017-06-23 12:35   ` Tvrtko Ursulin
  2017-06-22 11:33 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev4) Patchwork
  5 siblings, 1 reply; 17+ messages in thread
From: Chris Wilson @ 2017-06-22 10:56 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.

This is not a fix; this is just a workaround that unbreaks machines
until we can resolve the deadlock in a way that doesn't lose data!

v2: Move the retirement from set-wegded to the i915_reset() error path,
after which we no longer any delayed worker cleanup for
i915_handle_error()
v3: C abuse for syntactic sugar
v4: Cover all waits with the timeout to catch more driver breakage

References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c |  1 +
 drivers/gpu/drm/i915/i915_gem.c | 18 +++-------
 drivers/gpu/drm/i915/i915_irq.c | 79 ++++++++++++++++++++++++++++++-----------
 3 files changed, 64 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index d1aa10c9cc5d..1df957b986c7 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
 
 error:
 	i915_gem_set_wedged(dev_priv);
+	i915_gem_retire_requests(dev_priv);
 	goto finish;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7391e2d36a31..8bc9a3f53006 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3046,7 +3046,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
 	/* Mark all executing requests as skipped */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 	list_for_each_entry(request, &engine->timeline->requests, link)
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EIO);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* Mark all pending requests as complete so that any concurrent
@@ -3092,6 +3093,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
 	for_each_engine(engine, i915, id)
 		engine_set_wedged(engine);
 
@@ -3100,20 +3102,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
-
-	/* Retire completed requests first so the list of inflight/incomplete
-	 * requests is accurate and we don't try and mark successful requests
-	 * as in error during __i915_gem_set_wedged_BKL().
-	 */
-	i915_gem_retire_requests(dev_priv);
-
 	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
-
-	i915_gem_contexts_lost(dev_priv);
-
-	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
@@ -3168,6 +3157,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
+	i915_gem_contexts_lost(i915);
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 61047ee2eede..5fa2a1cf71b8 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2587,6 +2587,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	dev_err(w->i915->drm.dev,
+		"%s timed out, cancelling all in-flight rendering.\n",
+		w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+			 struct drm_i915_private *i915,
+			 long timeout,
+			 const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
+
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
+	     (W)->i915;							\
+	     __fini_wedge((W)))
+
 /**
  * i915_reset_device - do process context error handling work
  * @dev_priv: i915 device private
@@ -2600,36 +2640,35 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	intel_prepare_reset(dev_priv);
+	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
+		intel_prepare_reset(dev_priv);
 
-	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
-	wake_up_all(&dev_priv->gpu_error.wait_queue);
+		/* Signal that locked waiters should reset the GPU */
+		set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+		wake_up_all(&dev_priv->gpu_error.wait_queue);
 
-	do {
-		/*
-		 * All state reset _must_ be completed before we update the
-		 * reset counter, for otherwise waiters might miss the reset
-		 * pending state and not properly drop locks, resulting in
-		 * deadlocks with the reset work.
+		/* Wait for anyone holding the lock to wakeup, without
+		 * blocking indefinitely on struct_mutex.
 		 */
-		if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
-			i915_reset(dev_priv);
-			mutex_unlock(&dev_priv->drm.struct_mutex);
-		}
-
-		/* We need to wait for anyone holding the lock to wakeup */
-	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
-				     I915_RESET_HANDOFF,
-				     TASK_UNINTERRUPTIBLE,
-				     HZ));
+		do {
+			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
+				i915_reset(dev_priv);
+				mutex_unlock(&dev_priv->drm.struct_mutex);
+			}
+		} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
+					     I915_RESET_HANDOFF,
+					     TASK_UNINTERRUPTIBLE,
+					     1));
 
-	intel_finish_reset(dev_priv);
+		intel_finish_reset(dev_priv);
+	}
 
 	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		kobject_uevent_env(kobj,
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev4)
  2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
                   ` (4 preceding siblings ...)
  2017-06-22 10:56 ` [PATCH v3] drm/i915: Break modeset deadlocks on reset Chris Wilson
@ 2017-06-22 11:33 ` Patchwork
  5 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2017-06-22 11:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Break modeset deadlocks on reset (rev4)
URL   : https://patchwork.freedesktop.org/series/26059/
State : success

== Summary ==

Series 26059v4 drm/i915: Break modeset deadlocks on reset
https://patchwork.freedesktop.org/api/1.0/series/26059/revisions/4/mbox/

Test gem_exec_suspend:
        Subgroup basic-s4-devices:
                dmesg-warn -> PASS       (fi-kbl-7560u) fdo#100125
Test kms_cursor_legacy:
        Subgroup basic-busy-flip-before-cursor-atomic:
                fail       -> PASS       (fi-snb-2600) fdo#100215
Test prime_busy:
        Subgroup basic-wait-after-default:
                dmesg-warn -> PASS       (fi-skl-6700hq) fdo#101515 +3

fdo#100125 https://bugs.freedesktop.org/show_bug.cgi?id=100125
fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215
fdo#101515 https://bugs.freedesktop.org/show_bug.cgi?id=101515

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time:448s
fi-bdw-gvtdvm    total:278  pass:256  dwarn:8   dfail:0   fail:0   skip:14  time:434s
fi-bsw-n3050     total:278  pass:241  dwarn:1   dfail:0   fail:0   skip:36  time:521s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:502s
fi-byt-j1900     total:278  pass:252  dwarn:2   dfail:0   fail:0   skip:24  time:493s
fi-byt-n2820     total:278  pass:248  dwarn:2   dfail:0   fail:0   skip:28  time:485s
fi-glk-2a        total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:591s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:434s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:409s
fi-ilk-650       total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  time:424s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:496s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:476s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:467s
fi-kbl-7560u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:573s
fi-kbl-r         total:278  pass:259  dwarn:1   dfail:0   fail:0   skip:18  time:583s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:462s
fi-skl-6700hq    total:278  pass:220  dwarn:3   dfail:0   fail:30  skip:24  time:340s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time:464s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:475s
fi-skl-gvtdvm    total:278  pass:265  dwarn:0   dfail:0   fail:0   skip:13  time:434s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time:545s
fi-snb-2600      total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  time:402s

7f2c92652872fbbe37c8bca92eccb7218100c21d drm-tip: 2017y-06m-22d-08h-47m-32s UTC integration manifest
0cfaf18 drm/i915: Break modeset deadlocks on reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_5024/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3] drm/i915: Break modeset deadlocks on reset
  2017-06-22 10:56 ` [PATCH v3] drm/i915: Break modeset deadlocks on reset Chris Wilson
@ 2017-06-23 12:35   ` Tvrtko Ursulin
  2017-06-23 12:44     ` Chris Wilson
  0 siblings, 1 reply; 17+ messages in thread
From: Tvrtko Ursulin @ 2017-06-23 12:35 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Mika Kuoppala


On 22/06/2017 11:56, Chris Wilson wrote:
> Trying to do a modeset from within a reset is fraught with danger. We
> can fall into a cyclic deadlock where the modeset is waiting on a
> previous modeset that is waiting on a request, and since the GPU hung
> that request completion is waiting on the reset. As modesetting doesn't
> allow its locks to be broken and restarted, or for its *own* reset
> mechanism to take over the display, we have to do something very
> evil instead. If we detect that we are stuck waiting to prepare the
> display reset (by using a very simple timeout), resort to cancelling all
> in-flight requests and throwing the user data into /dev/null, which is
> marginally better than the driver locking up and keeping that data to
> itself.
> 
> This is not a fix; this is just a workaround that unbreaks machines
> until we can resolve the deadlock in a way that doesn't lose data!
> 
> v2: Move the retirement from set-wegded to the i915_reset() error path,
> after which we no longer any delayed worker cleanup for
> i915_handle_error()
> v3: C abuse for syntactic sugar
> v4: Cover all waits with the timeout to catch more driver breakage
> 
> References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> ---
>   drivers/gpu/drm/i915/i915_drv.c |  1 +
>   drivers/gpu/drm/i915/i915_gem.c | 18 +++-------
>   drivers/gpu/drm/i915/i915_irq.c | 79 ++++++++++++++++++++++++++++++-----------
>   3 files changed, 64 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index d1aa10c9cc5d..1df957b986c7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1914,6 +1914,7 @@ void i915_reset(struct drm_i915_private *dev_priv)
>   
>   error:
>   	i915_gem_set_wedged(dev_priv);
> +	i915_gem_retire_requests(dev_priv);
>   	goto finish;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 7391e2d36a31..8bc9a3f53006 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3046,7 +3046,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
>   	/* Mark all executing requests as skipped */
>   	spin_lock_irqsave(&engine->timeline->lock, flags);
>   	list_for_each_entry(request, &engine->timeline->requests, link)
> -		dma_fence_set_error(&request->fence, -EIO);
> +		if (!i915_gem_request_completed(request))
> +			dma_fence_set_error(&request->fence, -EIO);
>   	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>   
>   	/* Mark all pending requests as complete so that any concurrent
> @@ -3092,6 +3093,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
>   
> +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
>   	for_each_engine(engine, i915, id)
>   		engine_set_wedged(engine);
>   
> @@ -3100,20 +3102,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
>   
>   void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
>   {
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
> -
> -	/* Retire completed requests first so the list of inflight/incomplete
> -	 * requests is accurate and we don't try and mark successful requests
> -	 * as in error during __i915_gem_set_wedged_BKL().
> -	 */
> -	i915_gem_retire_requests(dev_priv);
> -
>   	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
> -
> -	i915_gem_contexts_lost(dev_priv);
> -
> -	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
>   }
>   
>   bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> @@ -3168,6 +3157,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>   	 * context and do not require stop_machine().
>   	 */
>   	intel_engines_reset_default_submission(i915);
> +	i915_gem_contexts_lost(i915);
>   
>   	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>   	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 61047ee2eede..5fa2a1cf71b8 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2587,6 +2587,46 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>   	return ret;
>   }
>   
> +struct wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	dev_err(w->i915->drm.dev,
> +		"%s timed out, cancelling all in-flight rendering.\n",
> +		w->name);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __init_wedge(struct wedge_me *w,
> +			 struct drm_i915_private *i915,
> +			 long timeout,
> +			 const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __fini_wedge(struct wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> +
> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
> +	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
> +	     (W)->i915;							\
> +	     __fini_wedge((W)))
> +
>   /**
>    * i915_reset_device - do process context error handling work
>    * @dev_priv: i915 device private
> @@ -2600,36 +2640,35 @@ static void i915_reset_device(struct drm_i915_private *dev_priv)
>   	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
>   	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
>   	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> +	struct wedge_me w;
>   
>   	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
>   
>   	DRM_DEBUG_DRIVER("resetting chip\n");
>   	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
>   
> -	intel_prepare_reset(dev_priv);
> +	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
> +		intel_prepare_reset(dev_priv);
>   
> -	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> -	wake_up_all(&dev_priv->gpu_error.wait_queue);
> +		/* Signal that locked waiters should reset the GPU */
> +		set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> +		wake_up_all(&dev_priv->gpu_error.wait_queue);
>   
> -	do {
> -		/*
> -		 * All state reset _must_ be completed before we update the
> -		 * reset counter, for otherwise waiters might miss the reset
> -		 * pending state and not properly drop locks, resulting in
> -		 * deadlocks with the reset work.
> +		/* Wait for anyone holding the lock to wakeup, without
> +		 * blocking indefinitely on struct_mutex.
>   		 */
> -		if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> -			i915_reset(dev_priv);
> -			mutex_unlock(&dev_priv->drm.struct_mutex);
> -		}
> -
> -		/* We need to wait for anyone holding the lock to wakeup */
> -	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> -				     I915_RESET_HANDOFF,
> -				     TASK_UNINTERRUPTIBLE,
> -				     HZ));
> +		do {
> +			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> +				i915_reset(dev_priv);
> +				mutex_unlock(&dev_priv->drm.struct_mutex);
> +			}
> +		} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> +					     I915_RESET_HANDOFF,
> +					     TASK_UNINTERRUPTIBLE,
> +					     1));
>   
> -	intel_finish_reset(dev_priv);
> +		intel_finish_reset(dev_priv);
> +	}
>   
>   	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
>   		kobject_uevent_env(kobj,
> 

Looks OK - lets see how temporary it will end up being. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3] drm/i915: Break modeset deadlocks on reset
  2017-06-23 12:35   ` Tvrtko Ursulin
@ 2017-06-23 12:44     ` Chris Wilson
  0 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-23 12:44 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx; +Cc: Mika Kuoppala

Quoting Tvrtko Ursulin (2017-06-23 13:35:24)
> 
> On 22/06/2017 11:56, Chris Wilson wrote:
> > Trying to do a modeset from within a reset is fraught with danger. We
> > can fall into a cyclic deadlock where the modeset is waiting on a
> > previous modeset that is waiting on a request, and since the GPU hung
> > that request completion is waiting on the reset. As modesetting doesn't
> > allow its locks to be broken and restarted, or for its *own* reset
> > mechanism to take over the display, we have to do something very
> > evil instead. If we detect that we are stuck waiting to prepare the
> > display reset (by using a very simple timeout), resort to cancelling all
> > in-flight requests and throwing the user data into /dev/null, which is
> > marginally better than the driver locking up and keeping that data to
> > itself.
> > 
> > This is not a fix; this is just a workaround that unbreaks machines
> > until we can resolve the deadlock in a way that doesn't lose data!
> > 
> > v2: Move the retirement from set-wegded to the i915_reset() error path,
> > after which we no longer any delayed worker cleanup for
> > i915_handle_error()
> > v3: C abuse for syntactic sugar
> > v4: Cover all waits with the timeout to catch more driver breakage
> > 
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=99093
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> [snip]
> Looks OK - lets see how temporary it will end up being. :)

I've grown to like the idea of having a watchdog here. We critically
depend on reset to ensure forward progress of requests, and this gives
us that extra layer of paranoid protection.
 
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Thanks, the real magic is still to come :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2017-06-23 12:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-20 19:55 ` Chris Wilson
2017-06-21 11:30   ` Tvrtko Ursulin
2017-06-21 11:46     ` Chris Wilson
2017-06-21 12:40       ` Chris Wilson
2017-06-21 13:37       ` Tvrtko Ursulin
2017-06-21 13:48         ` Chris Wilson
2017-06-20 20:11 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev2) Patchwork
2017-06-21 14:24 ` [PATCH v2] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-21 15:55   ` Maarten Lankhorst
2017-06-21 16:06     ` Chris Wilson
2017-06-21 17:27       ` Maarten Lankhorst
2017-06-21 15:39 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev3) Patchwork
2017-06-22 10:56 ` [PATCH v3] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-23 12:35   ` Tvrtko Ursulin
2017-06-23 12:44     ` Chris Wilson
2017-06-22 11:33 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev4) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.