All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Break modeset deadlocks on reset
@ 2017-06-20 16:12 Chris Wilson
  2017-06-20 19:55 ` Chris Wilson
                   ` (5 more replies)
  0 siblings, 6 replies; 17+ messages in thread
From: Chris Wilson @ 2017-06-20 16:12 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trying to do a modeset from within a reset is fraught with danger. We
can fall into a cyclic deadlock where the modeset is waiting on a
previous modeset that is waiting on a request, and since the GPU hung
that request completion is waiting on the reset. As modesetting doesn't
allow its locks to be broken and restarted, or for its *own* reset
mechanism to take over the display, we have to do something very
evil instead. If we detect that we are stuck waiting to prepare the
display reset (by using a very simple timeout), resort to cancelling all
in-flight requests and throwing the user data into /dev/null, which is
marginally better than the driver locking up and keeping that data to
itself.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c | 18 ++++--------------
 drivers/gpu/drm/i915/i915_irq.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7dcac3bfb771..8122cab400ef 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3040,7 +3040,8 @@ static void engine_set_wedged(struct intel_engine_cs *engine)
 	/* Mark all executing requests as skipped */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 	list_for_each_entry(request, &engine->timeline->requests, link)
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!i915_gem_request_completed(request))
+			dma_fence_set_error(&request->fence, -EIO);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	/* Mark all pending requests as complete so that any concurrent
@@ -3079,6 +3080,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
 	for_each_engine(engine, i915, id)
 		engine_set_wedged(engine);
 
@@ -3087,20 +3089,7 @@ static int __i915_gem_set_wedged_BKL(void *data)
 
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 {
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-	set_bit(I915_WEDGED, &dev_priv->gpu_error.flags);
-
-	/* Retire completed requests first so the list of inflight/incomplete
-	 * requests is accurate and we don't try and mark successful requests
-	 * as in error during __i915_gem_set_wedged_BKL().
-	 */
-	i915_gem_retire_requests(dev_priv);
-
 	stop_machine(__i915_gem_set_wedged_BKL, dev_priv, NULL);
-
-	i915_gem_context_lost(dev_priv);
-
-	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
@@ -3155,6 +3144,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
+	i915_gem_context_lost(i915);
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index bce2d1feceb1..d2521a1973d9 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2599,6 +2599,40 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	DRM_ERROR("%s timed out, cancelling all in-flight rendering.\n",
+		  w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static bool init_wedge_on_timeout(struct wedge_me *w,
+				  struct drm_i915_private *i915,
+				  long timeout,
+				  const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+
+	schedule_delayed_work(&w->work, HZ);
+	return true;
+}
+
+static void destroy_wedge_on_timeout(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+}
+
 /**
  * i915_reset_and_wakeup - do process context error handling work
  * @dev_priv: i915 device private
@@ -2612,13 +2646,17 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
 	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
 	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
 
 	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
 
 	DRM_DEBUG_DRIVER("resetting chip\n");
 	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
 
-	intel_prepare_reset(dev_priv);
+	if (init_wedge_on_timeout(&w, dev_priv, HZ, "intel_prepare_reset")) {
+		intel_prepare_reset(dev_priv);
+		destroy_wedge_on_timeout(&w);
+	}
 
 	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.wait_queue);
@@ -2642,6 +2680,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 				     HZ));
 
 	intel_finish_reset(dev_priv);
+	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 
 	if (!test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
 		kobject_uevent_env(kobj,
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2017-06-23 12:44 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-20 16:12 [PATCH] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-20 19:55 ` Chris Wilson
2017-06-21 11:30   ` Tvrtko Ursulin
2017-06-21 11:46     ` Chris Wilson
2017-06-21 12:40       ` Chris Wilson
2017-06-21 13:37       ` Tvrtko Ursulin
2017-06-21 13:48         ` Chris Wilson
2017-06-20 20:11 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev2) Patchwork
2017-06-21 14:24 ` [PATCH v2] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-21 15:55   ` Maarten Lankhorst
2017-06-21 16:06     ` Chris Wilson
2017-06-21 17:27       ` Maarten Lankhorst
2017-06-21 15:39 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev3) Patchwork
2017-06-22 10:56 ` [PATCH v3] drm/i915: Break modeset deadlocks on reset Chris Wilson
2017-06-23 12:35   ` Tvrtko Ursulin
2017-06-23 12:44     ` Chris Wilson
2017-06-22 11:33 ` ✓ Fi.CI.BAT: success for drm/i915: Break modeset deadlocks on reset (rev4) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.