All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags
@ 2017-03-15 14:01 Chris Wilson
  2017-03-15 14:01 ` [PATCH 2/4] drm/i915: Move engine->submit_request selection to a vfunc Chris Wilson
                   ` (5 more replies)
  0 siblings, 6 replies; 10+ messages in thread
From: Chris Wilson @ 2017-03-15 14:01 UTC (permalink / raw)
  To: intel-gfx

I915_RESET_IN_PROGRESS is being used for both signaling the requirement
to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
to instruct a waiter (already holding the struct_mutex) to perform the
reset. To allow for a little more coordination, split these two meaning
into a couple of distinct flags. I915_RESET_BACKOFF tells
i915_mutex_lock_interruptible() not to acquire the mutex and
I915_RESET_HANDOFF tells the waiter to call i915_reset().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c              | 16 +++++-----
 drivers/gpu/drm/i915/i915_drv.c                  |  7 +++--
 drivers/gpu/drm/i915/i915_drv.h                  | 40 +++++++++++++++++++++---
 drivers/gpu/drm/i915/i915_gem.c                  |  5 +--
 drivers/gpu/drm/i915/i915_gem_request.c          |  2 +-
 drivers/gpu/drm/i915/i915_irq.c                  | 38 ++++------------------
 drivers/gpu/drm/i915/intel_display.c             |  4 +--
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++----
 8 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 9db6b041a799..5fdf8c137235 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	enum intel_engine_id id;
 
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_printf(m, "Wedged\n");
-	if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
-		seq_printf(m, "Reset in progress\n");
+		seq_puts(m, "Wedged\n");
+	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
+		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
+	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
+		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_printf(m, "Waiter holding struct mutex\n");
+		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_printf(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "struct_mutex blocked for reset\n");
 
 	if (!i915.enable_hangcheck) {
-		seq_printf(m, "Hangcheck disabled\n");
+		seq_puts(m, "Hangcheck disabled\n");
 		return 0;
 	}
 
@@ -4121,7 +4123,7 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_in_progress(&dev_priv->gpu_error))
+	if (i915_reset_backoff(&dev_priv->gpu_error))
 		return -EAGAIN;
 
 	i915_handle_error(dev_priv, val,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 9164167cd147..be3c81221d11 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
 	int ret;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
+	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
 		return;
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
@@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
 wakeup:
 	i915_gem_reset_finish(dev_priv);
 	enable_irq(dev_priv->drm.irq);
-	wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
+
+	clear_bit(I915_RESET_HANDOFF, &error->flags);
+	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
 
 error:
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5156bcc59dea..0b02a6d710e1 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1595,8 +1595,33 @@ struct i915_gpu_error {
 	 */
 	unsigned long reset_count;
 
+	/**
+	 * flags: Control various stages of the GPU reset
+	 *
+	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
+	 * other users acquiring the struct_mutex. To do this we set the
+	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
+	 * and then check for that bit before acquiring the struct_mutex (in
+	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
+	 * secondary role in preventing two concurrent global reset attempts.
+	 *
+	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
+	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
+	 * but it may be held by some long running waiter (that we cannot
+	 * interrupt without causing trouble). Once we are ready to do the GPU
+	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
+	 * they already hold the struct_mutex and want to participate they can
+	 * inspect the bit and do the reset directly, otherwise the worker
+	 * waits for the struct_mutex.
+	 *
+	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
+	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
+	 * i915_gem_request_alloc(), this bit is checked and the sequence
+	 * aborted (with -EIO reported to userspace) if set.
+	 */
 	unsigned long flags;
-#define I915_RESET_IN_PROGRESS	0
+#define I915_RESET_BACKOFF	0
+#define I915_RESET_HANDOFF	1
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
 	/**
@@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
 
 void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
 
-static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff(struct i915_gpu_error *error)
+{
+	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
+}
+
+static inline bool i915_reset_handoff(struct i915_gpu_error *error)
 {
-	return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
+	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
 }
 
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
@@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
+static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
 {
-	return i915_reset_in_progress(error) | i915_terminally_wedged(error);
+	return i915_reset_backoff(error) | i915_terminally_wedged(error);
 }
 
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d87983ba536f..ecd1b038318a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
 
 	might_sleep();
 
-	if (!i915_reset_in_progress(error))
-		return 0;
-
 	/*
 	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
 	 * userspace. If it takes that long something really bad is going on and
 	 * we should simply try to bail out and fail as gracefully as possible.
 	 */
 	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_in_progress(error),
+					       !i915_reset_backoff(error),
 					       I915_RESET_TIMEOUT);
 	if (ret == 0) {
 		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1e1d9f2072cd..0e8d1010cecb 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 
 static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
 {
-	if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
+	if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
 		return false;
 
 	__set_current_state(TASK_RUNNING);
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index e646c4eba65d..495e6a79cacf 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return ret;
 }
 
-static void i915_error_wake_up(struct drm_i915_private *dev_priv)
-{
-	/*
-	 * Notify all waiters for GPU completion events that reset state has
-	 * been changed, and that they need to restart their wait after
-	 * checking for potential errors (and bail out to drop locks if there is
-	 * a gpu reset pending so that i915_error_work_func can acquire them).
-	 */
-
-	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
-	wake_up_all(&dev_priv->gpu_error.wait_queue);
-
-	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
-	wake_up_all(&dev_priv->pending_flip_queue);
-}
-
 /**
  * i915_reset_and_wakeup - do process context error handling work
  * @dev_priv: i915 device private
@@ -2676,6 +2660,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	intel_runtime_pm_get(dev_priv);
 	intel_prepare_reset(dev_priv);
 
+	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
+	wake_up_all(&dev_priv->gpu_error.wait_queue);
+
 	do {
 		/*
 		 * All state reset _must_ be completed before we update the
@@ -2690,7 +2677,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 
 		/* We need to wait for anyone holding the lock to wakeup */
 	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
-				     I915_RESET_IN_PROGRESS,
+				     I915_RESET_HANDOFF,
 				     TASK_UNINTERRUPTIBLE,
 				     HZ));
 
@@ -2705,6 +2692,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
 	 * Note: The wake_up also serves as a memory barrier so that
 	 * waiters see the updated value of the dev_priv->gpu_error.
 	 */
+	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
 	wake_up_all(&dev_priv->gpu_error.reset_queue);
 }
 
@@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
 	if (!engine_mask)
 		return;
 
-	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
+	if (test_and_set_bit(I915_RESET_BACKOFF,
 			     &dev_priv->gpu_error.flags))
 		return;
 
-	/*
-	 * Wakeup waiting processes so that the reset function
-	 * i915_reset_and_wakeup doesn't deadlock trying to grab
-	 * various locks. By bumping the reset counter first, the woken
-	 * processes will see a reset in progress and back off,
-	 * releasing their locks and then wait for the reset completion.
-	 * We must do this for _all_ gpu waiters that might hold locks
-	 * that the reset work needs to acquire.
-	 *
-	 * Note: The wake_up also provides a memory barrier to ensure that the
-	 * waiters see the updated value of the reset flags.
-	 */
-	i915_error_wake_up(dev_priv);
-
 	i915_reset_and_wakeup(dev_priv);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 4b73513b46c1..5959c9b6dc97 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
 {
 	struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
 
-	if (i915_reset_in_progress(error))
+	if (i915_reset_backoff(error))
 		return true;
 
 	if (crtc->reset_count != i915_reset_count(error))
@@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
 		goto cleanup;
 
 	intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
-	if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
+	if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
 		ret = -EIO;
 		goto unlock;
 	}
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index d4acee6730e9..6ec7c731a267 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
 
 	/* Check that we can issue a global GPU reset */
 
-	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	reset_count = i915_reset_count(&i915->gpu_error);
@@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
 	}
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
+	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 	if (i915_terminally_wedged(&i915->gpu_error))
 		err = -EIO;
 
@@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
 
 	reset_count = i915_reset_count(&rq->i915->gpu_error);
 
-	set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
+	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
 	wake_up_all(&rq->i915->gpu_error.wait_queue);
 
 	return reset_count;
@@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
 
 	/* Check that we detect a stuck waiter and issue a reset */
 
-	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
@@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
 		err = timeout;
 		goto out_rq;
 	}
-	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
 
+	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	if (i915_reset_count(&i915->gpu_error) == reset_count) {
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
@@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
@@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
 	if (!igt_can_mi_store_dword_imm(i915))
 		return 0;
 
+	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
 	if (err)
@@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
 
 			i915_reset(i915);
 
-			GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
+			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
 					    &i915->gpu_error.flags));
+
 			if (prev->fence.error != -EIO) {
 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
 				       prev->fence.error);
@@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/4] drm/i915: Move engine->submit_request selection to a vfunc
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
@ 2017-03-15 14:01 ` Chris Wilson
  2017-03-15 14:01 ` [PATCH 3/4] drm/i915: Restore engine->submit_request before unwedging Chris Wilson
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2017-03-15 14:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

It turns out that we may want to restore the original
engine->submit_request (and engine->schedule) callbacks from more than
just the guc <-> execlists transition. Move this to a vfunc so we can
have a common interface.

v2: Move initial selection to intel_engines_init_common(), repaint vfunc
with engine->set_default_submission (and a similar colour for the
helper).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c |  2 +-
 drivers/gpu/drm/i915/intel_engine_cs.c     | 12 ++++++++++++
 drivers/gpu/drm/i915/intel_lrc.c           | 15 +++++----------
 drivers/gpu/drm/i915/intel_lrc.h           |  1 -
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 15 +++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.h    |  4 ++++
 6 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index b4d24cd7639a..ece541cea965 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1051,7 +1051,7 @@ void i915_guc_submission_disable(struct drm_i915_private *dev_priv)
 		return;
 
 	/* Revert back to manual ELSP submission */
-	intel_execlists_enable_submission(dev_priv);
+	intel_engines_reset_default_submission(dev_priv);
 }
 
 void i915_guc_submission_fini(struct drm_i915_private *dev_priv)
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 73fe718516a5..79bf11bf2e81 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -191,6 +191,7 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
 			goto cleanup;
 		}
 
+		GEM_BUG_ON(!engine->submit_request);
 		mask |= ENGINE_MASK(id);
 	}
 
@@ -342,6 +343,8 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 {
 	int ret;
 
+	engine->set_default_submission(engine);
+
 	/* We may need to do things with the shrinker which
 	 * require us to immediately switch back to the default
 	 * context. This can cause a problem as pinning the
@@ -1115,6 +1118,15 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
 	return true;
 }
 
+void intel_engines_reset_default_submission(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	for_each_engine(engine, i915, id)
+		engine->set_default_submission(engine);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/mock_engine.c"
 #endif
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 89f38e7def9f..ae7dd9913298 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1560,15 +1560,10 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	kfree(engine);
 }
 
-void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
+static void execlists_set_default_submission(struct intel_engine_cs *engine)
 {
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	for_each_engine(engine, dev_priv, id) {
-		engine->submit_request = execlists_submit_request;
-		engine->schedule = execlists_schedule;
-	}
+	engine->submit_request = execlists_submit_request;
+	engine->schedule = execlists_schedule;
 }
 
 static void
@@ -1586,8 +1581,8 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_flush = gen8_emit_flush;
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
-	engine->submit_request = execlists_submit_request;
-	engine->schedule = execlists_schedule;
+
+	engine->set_default_submission = execlists_set_default_submission;
 
 	engine->irq_enable = gen8_logical_ring_enable_irq;
 	engine->irq_disable = gen8_logical_ring_disable_irq;
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 5fc07761caff..e8015e7bf4e9 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -87,6 +87,5 @@ uint64_t intel_lr_context_descriptor(struct i915_gem_context *ctx,
 /* Execlists */
 int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
-void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
 
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 4a864f8c9387..1befcdf9b646 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -2050,6 +2050,16 @@ static void intel_ring_init_irq(struct drm_i915_private *dev_priv,
 	}
 }
 
+static void i9xx_set_default_submission(struct intel_engine_cs *engine)
+{
+	engine->submit_request = i9xx_submit_request;
+}
+
+static void gen6_bsd_set_default_submission(struct intel_engine_cs *engine)
+{
+	engine->submit_request = gen6_bsd_submit_request;
+}
+
 static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 				      struct intel_engine_cs *engine)
 {
@@ -2080,7 +2090,8 @@ static void intel_ring_default_vfuncs(struct drm_i915_private *dev_priv,
 				engine->emit_breadcrumb_sz++;
 		}
 	}
-	engine->submit_request = i9xx_submit_request;
+
+	engine->set_default_submission = i9xx_set_default_submission;
 
 	if (INTEL_GEN(dev_priv) >= 8)
 		engine->emit_bb_start = gen8_emit_bb_start;
@@ -2165,7 +2176,7 @@ int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine)
 	if (INTEL_GEN(dev_priv) >= 6) {
 		/* gen6 bsd needs a special wa for tail updates */
 		if (IS_GEN6(dev_priv))
-			engine->submit_request = gen6_bsd_submit_request;
+			engine->set_default_submission = gen6_bsd_set_default_submission;
 		engine->emit_flush = gen6_bsd_ring_flush;
 		if (INTEL_GEN(dev_priv) < 8)
 			engine->irq_enable_mask = GT_BSD_USER_INTERRUPT;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 0ef491df5b4e..6963e25187d0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -273,6 +273,8 @@ struct intel_engine_cs {
 	void		(*reset_hw)(struct intel_engine_cs *engine,
 				    struct drm_i915_gem_request *req);
 
+	void		(*set_default_submission)(struct intel_engine_cs *engine);
+
 	int		(*context_pin)(struct intel_engine_cs *engine,
 				       struct i915_gem_context *ctx);
 	void		(*context_unpin)(struct intel_engine_cs *engine,
@@ -669,4 +671,6 @@ static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset)
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
 bool intel_engines_are_idle(struct drm_i915_private *dev_priv);
 
+void intel_engines_reset_default_submission(struct drm_i915_private *i915);
+
 #endif /* _INTEL_RINGBUFFER_H_ */
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 3/4] drm/i915: Restore engine->submit_request before unwedging
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
  2017-03-15 14:01 ` [PATCH 2/4] drm/i915: Move engine->submit_request selection to a vfunc Chris Wilson
@ 2017-03-15 14:01 ` Chris Wilson
  2017-03-15 14:01 ` [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged Chris Wilson
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2017-03-15 14:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

When we wedge the device, we override engine->submit_request with a nop
to ensure that all in-flight requests are marked in error. However, igt
would like to unwedge the device to test -EIO handling. This requires us
to flush those in-flight requests and restore the original
engine->submit_request.

v2: Use a vfunc to unify enabling request submission to engines
v3: Split new vfunc to a separate patch.
v4: Make the wait interruptible -- the third party fences we wait upon
may be indefinitely broken, so allow the reset to be aborted.

Fixes: 821ed7df6e2a ("drm/i915: Update reset path to fix incomplete requests")
Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> #v3
---
 drivers/gpu/drm/i915/i915_drv.c |  9 ++++---
 drivers/gpu/drm/i915/i915_drv.h |  1 +
 drivers/gpu/drm/i915/i915_gem.c | 59 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index be3c81221d11..03d9e45694c9 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1821,7 +1821,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
 		return;
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	__clear_bit(I915_WEDGED, &error->flags);
+	if (!i915_gem_unset_wedged(dev_priv))
+		goto wakeup;
+
 	error->reset_count++;
 
 	pr_notice("drm/i915: Resetting chip after gpu hang\n");
@@ -1867,17 +1869,18 @@ void i915_reset(struct drm_i915_private *dev_priv)
 
 	i915_queue_hangcheck(dev_priv);
 
-wakeup:
+finish:
 	i915_gem_reset_finish(dev_priv);
 	enable_irq(dev_priv->drm.irq);
 
+wakeup:
 	clear_bit(I915_RESET_HANDOFF, &error->flags);
 	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
 
 error:
 	i915_gem_set_wedged(dev_priv);
-	goto wakeup;
+	goto finish;
 }
 
 static int i915_pm_suspend(struct device *kdev)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 0b02a6d710e1..9981a216342e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3441,6 +3441,7 @@ int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
 void i915_gem_reset(struct drm_i915_private *dev_priv);
 void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
+bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ecd1b038318a..fd611b4c1a2c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2997,6 +2997,65 @@ void i915_gem_set_wedged(struct drm_i915_private *dev_priv)
 	mod_delayed_work(dev_priv->wq, &dev_priv->gt.idle_work, 0);
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gem_timeline *tl;
+	int i;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
+		return true;
+
+	/* Before unwedging, make sure that all pending operations
+	 * are flushed and errored out - we may have requests waiting upon
+	 * third party fences. We marked all inflight requests as EIO, and
+	 * every execbuf since returned EIO, for consistency we want all
+	 * the currently pending requests to also be marked as EIO, which
+	 * is done inside our nop_submit_request - and so we must wait.
+	 *
+	 * No more can be submitted until we reset the wedged bit.
+	 */
+	list_for_each_entry(tl, &i915->gt.timelines, link) {
+		for (i = 0; i < ARRAY_SIZE(tl->engine); i++) {
+			struct drm_i915_gem_request *rq;
+
+			rq = i915_gem_active_peek(&tl->engine[i].last_request,
+						  &i915->drm.struct_mutex);
+			if (!rq)
+				continue;
+
+			/* We can't use our normal waiter as we want to
+			 * avoid recursively trying to handle the current
+			 * reset. The basic dma_fence_default_wait() installs
+			 * a callback for dma_fence_signal(), which is
+			 * triggered by our nop handler (indirectly, the
+			 * callback enables the signaler thread which is
+			 * woken by the nop_submit_request() advancing the seqno
+			 * and when the seqno passes the fence, the signaler
+			 * then signals the fence waking us up).
+			 */
+			if (dma_fence_default_wait(&rq->fence, true,
+						   MAX_SCHEDULE_TIMEOUT) < 0)
+				return false;
+		}
+	}
+
+	/* Undo nop_submit_request. We prevent all new i915 requests from
+	 * being queued (by disallowing execbuf whilst wedged) so having
+	 * waited for all active requests above, we know the system is idle
+	 * and do not have to worry about a thread being inside
+	 * engine->submit_request() as we swap over. So unlike installing
+	 * the nop_submit_request on reset, we can do this from normal
+	 * context and do not require stop_machine().
+	 */
+	intel_engines_reset_default_submission(i915);
+
+	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
+	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
+
+	return true;
+}
+
 static void
 i915_gem_retire_work_handler(struct work_struct *work)
 {
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
  2017-03-15 14:01 ` [PATCH 2/4] drm/i915: Move engine->submit_request selection to a vfunc Chris Wilson
  2017-03-15 14:01 ` [PATCH 3/4] drm/i915: Restore engine->submit_request before unwedging Chris Wilson
@ 2017-03-15 14:01 ` Chris Wilson
  2017-03-16 16:38   ` Mika Kuoppala
  2017-03-15 15:10 ` ✗ Fi.CI.BAT: warning for series starting with [1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Patchwork
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 10+ messages in thread
From: Chris Wilson @ 2017-03-15 14:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Provide some serialisation between user operations by waiting for the
reset initiated by setting i915_wedged to complete.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 5fdf8c137235..ada823815b8a 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -4129,6 +4129,10 @@ i915_wedged_set(void *data, u64 val)
 	i915_handle_error(dev_priv, val,
 			  "Manually setting wedged to %llu", val);
 
+	wait_on_bit(&dev_priv->gpu_error.flags,
+		    I915_RESET_HANDOFF,
+		    TASK_UNINTERRUPTIBLE);
+
 	return 0;
 }
 
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* ✗ Fi.CI.BAT: warning for series starting with [1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
                   ` (2 preceding siblings ...)
  2017-03-15 14:01 ` [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged Chris Wilson
@ 2017-03-15 15:10 ` Patchwork
  2017-03-16 16:18 ` [PATCH 1/4] " Mika Kuoppala
  2017-03-16 16:47 ` Mika Kuoppala
  5 siblings, 0 replies; 10+ messages in thread
From: Patchwork @ 2017-03-15 15:10 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags
URL   : https://patchwork.freedesktop.org/series/21290/
State : warning

== Summary ==

Series 21290v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/21290/revisions/1/mbox/

Test drv_module_reload:
        Subgroup basic-reload-final:
                pass       -> DMESG-WARN (fi-skl-6770hq)
Test gem_exec_flush:
        Subgroup basic-batch-kernel-default-uc:
                fail       -> PASS       (fi-snb-2600) fdo#100007
Test kms_force_connector_basic:
        Subgroup prune-stale-modes:
                skip       -> PASS       (fi-snb-2600)

fdo#100007 https://bugs.freedesktop.org/show_bug.cgi?id=100007

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time: 457s
fi-bsw-n3050     total:278  pass:239  dwarn:0   dfail:0   fail:0   skip:39  time: 573s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time: 529s
fi-bxt-t5700     total:278  pass:258  dwarn:0   dfail:0   fail:0   skip:20  time: 550s
fi-byt-j1900     total:278  pass:251  dwarn:0   dfail:0   fail:0   skip:27  time: 502s
fi-byt-n2820     total:278  pass:247  dwarn:0   dfail:0   fail:0   skip:31  time: 500s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time: 434s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 512s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 501s
fi-kbl-7500u     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time: 485s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time: 489s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time: 481s
fi-skl-6770hq    total:278  pass:267  dwarn:1   dfail:0   fail:0   skip:10  time: 525s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time: 553s
fi-snb-2600      total:278  pass:249  dwarn:0   dfail:0   fail:0   skip:29  time: 416s

25f04730646ed8f7bc59e9e9d1cc9b4c9ecbc09d drm-tip: 2017y-03m-15d-12h-56m-15s UTC integration manifest
c9bad29 drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged
6e55d48 drm/i915: Restore engine->submit_request before unwedging
b6ed03d drm/i915: Move engine->submit_request selection to a vfunc
4cefe36 drm/i915: Split I915_RESET_IN_PROGRESS into two flags

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4183/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
                   ` (3 preceding siblings ...)
  2017-03-15 15:10 ` ✗ Fi.CI.BAT: warning for series starting with [1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Patchwork
@ 2017-03-16 16:18 ` Mika Kuoppala
  2017-03-16 16:47 ` Mika Kuoppala
  5 siblings, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2017-03-16 16:18 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> I915_RESET_IN_PROGRESS is being used for both signaling the requirement
> to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
> to instruct a waiter (already holding the struct_mutex) to perform the
> reset. To allow for a little more coordination, split these two meaning
> into a couple of distinct flags. I915_RESET_BACKOFF tells
> i915_mutex_lock_interruptible() not to acquire the mutex and
> I915_RESET_HANDOFF tells the waiter to call i915_reset().
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

I only could think of worse names for backoff/handoff.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> ---
>  drivers/gpu/drm/i915/i915_debugfs.c              | 16 +++++-----
>  drivers/gpu/drm/i915/i915_drv.c                  |  7 +++--
>  drivers/gpu/drm/i915/i915_drv.h                  | 40 +++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_gem.c                  |  5 +--
>  drivers/gpu/drm/i915/i915_gem_request.c          |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c                  | 38 ++++------------------
>  drivers/gpu/drm/i915/intel_display.c             |  4 +--
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++----
>  8 files changed, 71 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9db6b041a799..5fdf8c137235 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  	enum intel_engine_id id;
>  
>  	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> -		seq_printf(m, "Wedged\n");
> -	if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
> -		seq_printf(m, "Reset in progress\n");
> +		seq_puts(m, "Wedged\n");
> +	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
> +		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
> +	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
> +		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
>  	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
> -		seq_printf(m, "Waiter holding struct mutex\n");
> +		seq_puts(m, "Waiter holding struct mutex\n");
>  	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
> -		seq_printf(m, "struct_mutex blocked for reset\n");
> +		seq_puts(m, "struct_mutex blocked for reset\n");
>  
>  	if (!i915.enable_hangcheck) {
> -		seq_printf(m, "Hangcheck disabled\n");
> +		seq_puts(m, "Hangcheck disabled\n");
>  		return 0;
>  	}
>  
> @@ -4121,7 +4123,7 @@ i915_wedged_set(void *data, u64 val)
>  	 * while it is writing to 'i915_wedged'
>  	 */
>  
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> +	if (i915_reset_backoff(&dev_priv->gpu_error))
>  		return -EAGAIN;
>  
>  	i915_handle_error(dev_priv, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 9164167cd147..be3c81221d11 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  	int ret;
>  
>  	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>  
> -	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
> +	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
>  		return;
>  
>  	/* Clear any previous failed attempts at recovery. Time to try again. */
> @@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  wakeup:
>  	i915_gem_reset_finish(dev_priv);
>  	enable_irq(dev_priv->drm.irq);
> -	wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
> +
> +	clear_bit(I915_RESET_HANDOFF, &error->flags);
> +	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
>  	return;
>  
>  error:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5156bcc59dea..0b02a6d710e1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1595,8 +1595,33 @@ struct i915_gpu_error {
>  	 */
>  	unsigned long reset_count;
>  
> +	/**
> +	 * flags: Control various stages of the GPU reset
> +	 *
> +	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
> +	 * other users acquiring the struct_mutex. To do this we set the
> +	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
> +	 * and then check for that bit before acquiring the struct_mutex (in
> +	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
> +	 * secondary role in preventing two concurrent global reset attempts.
> +	 *
> +	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
> +	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
> +	 * but it may be held by some long running waiter (that we cannot
> +	 * interrupt without causing trouble). Once we are ready to do the GPU
> +	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
> +	 * they already hold the struct_mutex and want to participate they can
> +	 * inspect the bit and do the reset directly, otherwise the worker
> +	 * waits for the struct_mutex.
> +	 *
> +	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
> +	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
> +	 * i915_gem_request_alloc(), this bit is checked and the sequence
> +	 * aborted (with -EIO reported to userspace) if set.
> +	 */
>  	unsigned long flags;
> -#define I915_RESET_IN_PROGRESS	0
> +#define I915_RESET_BACKOFF	0
> +#define I915_RESET_HANDOFF	1
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>  
>  	/**
> @@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
>  
>  void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>  
> -static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> +static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> +{
> +	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
> +}
> +
> +static inline bool i915_reset_handoff(struct i915_gpu_error *error)
>  {
> -	return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
> +	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> @@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
>  	return unlikely(test_bit(I915_WEDGED, &error->flags));
>  }
>  
> -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
> +static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
>  {
> -	return i915_reset_in_progress(error) | i915_terminally_wedged(error);
> +	return i915_reset_backoff(error) | i915_terminally_wedged(error);
>  }
>  
>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d87983ba536f..ecd1b038318a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  
>  	might_sleep();
>  
> -	if (!i915_reset_in_progress(error))
> -		return 0;
> -
>  	/*
>  	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
>  	 * userspace. If it takes that long something really bad is going on and
>  	 * we should simply try to bail out and fail as gracefully as possible.
>  	 */
>  	ret = wait_event_interruptible_timeout(error->reset_queue,
> -					       !i915_reset_in_progress(error),
> +					       !i915_reset_backoff(error),
>  					       I915_RESET_TIMEOUT);
>  	if (ret == 0) {
>  		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 1e1d9f2072cd..0e8d1010cecb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
>  
>  static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
>  {
> -	if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
> +	if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
>  		return false;
>  
>  	__set_current_state(TASK_RUNNING);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e646c4eba65d..495e6a79cacf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>  	return ret;
>  }
>  
> -static void i915_error_wake_up(struct drm_i915_private *dev_priv)
> -{
> -	/*
> -	 * Notify all waiters for GPU completion events that reset state has
> -	 * been changed, and that they need to restart their wait after
> -	 * checking for potential errors (and bail out to drop locks if there is
> -	 * a gpu reset pending so that i915_error_work_func can acquire them).
> -	 */
> -
> -	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
> -	wake_up_all(&dev_priv->gpu_error.wait_queue);
> -
> -	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
> -	wake_up_all(&dev_priv->pending_flip_queue);
> -}
> -
>  /**
>   * i915_reset_and_wakeup - do process context error handling work
>   * @dev_priv: i915 device private
> @@ -2676,6 +2660,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	intel_runtime_pm_get(dev_priv);
>  	intel_prepare_reset(dev_priv);
>  
> +	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> +	wake_up_all(&dev_priv->gpu_error.wait_queue);
> +
>  	do {
>  		/*
>  		 * All state reset _must_ be completed before we update the
> @@ -2690,7 +2677,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  
>  		/* We need to wait for anyone holding the lock to wakeup */
>  	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> -				     I915_RESET_IN_PROGRESS,
> +				     I915_RESET_HANDOFF,
>  				     TASK_UNINTERRUPTIBLE,
>  				     HZ));
>  
> @@ -2705,6 +2692,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	 * Note: The wake_up also serves as a memory barrier so that
>  	 * waiters see the updated value of the dev_priv->gpu_error.
>  	 */
> +	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
>  	wake_up_all(&dev_priv->gpu_error.reset_queue);
>  }
>  
> @@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
>  	if (!engine_mask)
>  		return;
>  
> -	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> +	if (test_and_set_bit(I915_RESET_BACKOFF,
>  			     &dev_priv->gpu_error.flags))
>  		return;
>  
> -	/*
> -	 * Wakeup waiting processes so that the reset function
> -	 * i915_reset_and_wakeup doesn't deadlock trying to grab
> -	 * various locks. By bumping the reset counter first, the woken
> -	 * processes will see a reset in progress and back off,
> -	 * releasing their locks and then wait for the reset completion.
> -	 * We must do this for _all_ gpu waiters that might hold locks
> -	 * that the reset work needs to acquire.
> -	 *
> -	 * Note: The wake_up also provides a memory barrier to ensure that the
> -	 * waiters see the updated value of the reset flags.
> -	 */
> -	i915_error_wake_up(dev_priv);
> -
>  	i915_reset_and_wakeup(dev_priv);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 4b73513b46c1..5959c9b6dc97 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
>  {
>  	struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
>  
> -	if (i915_reset_in_progress(error))
> +	if (i915_reset_backoff(error))
>  		return true;
>  
>  	if (crtc->reset_count != i915_reset_count(error))
> @@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  		goto cleanup;
>  
>  	intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
> -	if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
> +	if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
>  		ret = -EIO;
>  		goto unlock;
>  	}
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index d4acee6730e9..6ec7c731a267 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
>  
>  	/* Check that we can issue a global GPU reset */
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
>  	reset_count = i915_reset_count(&i915->gpu_error);
> @@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
>  	}
>  	mutex_unlock(&i915->drm.struct_mutex);
>  
> -	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		err = -EIO;
>  
> @@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
>  
>  	reset_count = i915_reset_count(&rq->i915->gpu_error);
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
> +	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
>  	wake_up_all(&rq->i915->gpu_error.wait_queue);
>  
>  	return reset_count;
> @@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
>  
>  	/* Check that we detect a stuck waiter and issue a reset */
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
>  	err = hang_init(&h, i915);
> @@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
>  		err = timeout;
>  		goto out_rq;
>  	}
> -	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
>  
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
>  	if (i915_reset_count(&i915->gpu_error) == reset_count) {
>  		pr_err("No GPU reset recorded!\n");
>  		err = -EINVAL;
> @@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
>  	hang_fini(&h);
>  unlock:
>  	mutex_unlock(&i915->drm.struct_mutex);
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		return -EIO;
> @@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
>  	if (!igt_can_mi_store_dword_imm(i915))
>  		return 0;
>  
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  	mutex_lock(&i915->drm.struct_mutex);
>  	err = hang_init(&h, i915);
>  	if (err)
> @@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
>  
>  			i915_reset(i915);
>  
> -			GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
> +			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
>  					    &i915->gpu_error.flags));
> +
>  			if (prev->fence.error != -EIO) {
>  				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
>  				       prev->fence.error);
> @@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
>  	hang_fini(&h);
>  unlock:
>  	mutex_unlock(&i915->drm.struct_mutex);
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		return -EIO;
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged
  2017-03-15 14:01 ` [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged Chris Wilson
@ 2017-03-16 16:38   ` Mika Kuoppala
  2017-03-16 17:01     ` Chris Wilson
  2017-03-16 17:02     ` Mika Kuoppala
  0 siblings, 2 replies; 10+ messages in thread
From: Mika Kuoppala @ 2017-03-16 16:38 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Provide some serialisation between user operations by waiting for the
> reset initiated by setting i915_wedged to complete.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++
>  1 file changed, 4 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 5fdf8c137235..ada823815b8a 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -4129,6 +4129,10 @@ i915_wedged_set(void *data, u64 val)
>  	i915_handle_error(dev_priv, val,
>  			  "Manually setting wedged to %llu", val);
>  
> +	wait_on_bit(&dev_priv->gpu_error.flags,
> +		    I915_RESET_HANDOFF,
> +		    TASK_UNINTERRUPTIBLE);
> +

But what happens here if we have already prevented the reset and
the handoff never clears? Seems like it is a stuck task as the
handoff is not cleared on error path in i915_reset.

-Mika


>  	return 0;
>  }
>  
> -- 
> 2.11.0
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags
  2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
                   ` (4 preceding siblings ...)
  2017-03-16 16:18 ` [PATCH 1/4] " Mika Kuoppala
@ 2017-03-16 16:47 ` Mika Kuoppala
  5 siblings, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2017-03-16 16:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> I915_RESET_IN_PROGRESS is being used for both signaling the requirement
> to i915_mutex_lock_interruptible() to avoid taking the struct_mutex and
> to instruct a waiter (already holding the struct_mutex) to perform the
> reset. To allow for a little more coordination, split these two meaning
> into a couple of distinct flags. I915_RESET_BACKOFF tells
> i915_mutex_lock_interruptible() not to acquire the mutex and
> I915_RESET_HANDOFF tells the waiter to call i915_reset().
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

+ Acked-by: Michel Thierry <michel.thierry@intel.com>

from earlier version.
-Mika


> ---
>  drivers/gpu/drm/i915/i915_debugfs.c              | 16 +++++-----
>  drivers/gpu/drm/i915/i915_drv.c                  |  7 +++--
>  drivers/gpu/drm/i915/i915_drv.h                  | 40 +++++++++++++++++++++---
>  drivers/gpu/drm/i915/i915_gem.c                  |  5 +--
>  drivers/gpu/drm/i915/i915_gem_request.c          |  2 +-
>  drivers/gpu/drm/i915/i915_irq.c                  | 38 ++++------------------
>  drivers/gpu/drm/i915/intel_display.c             |  4 +--
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 18 +++++++----
>  8 files changed, 71 insertions(+), 59 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 9db6b041a799..5fdf8c137235 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1305,16 +1305,18 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  	enum intel_engine_id id;
>  
>  	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> -		seq_printf(m, "Wedged\n");
> -	if (test_bit(I915_RESET_IN_PROGRESS, &dev_priv->gpu_error.flags))
> -		seq_printf(m, "Reset in progress\n");
> +		seq_puts(m, "Wedged\n");
> +	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
> +		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
> +	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
> +		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
>  	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
> -		seq_printf(m, "Waiter holding struct mutex\n");
> +		seq_puts(m, "Waiter holding struct mutex\n");
>  	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
> -		seq_printf(m, "struct_mutex blocked for reset\n");
> +		seq_puts(m, "struct_mutex blocked for reset\n");
>  
>  	if (!i915.enable_hangcheck) {
> -		seq_printf(m, "Hangcheck disabled\n");
> +		seq_puts(m, "Hangcheck disabled\n");
>  		return 0;
>  	}
>  
> @@ -4121,7 +4123,7 @@ i915_wedged_set(void *data, u64 val)
>  	 * while it is writing to 'i915_wedged'
>  	 */
>  
> -	if (i915_reset_in_progress(&dev_priv->gpu_error))
> +	if (i915_reset_backoff(&dev_priv->gpu_error))
>  		return -EAGAIN;
>  
>  	i915_handle_error(dev_priv, val,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 9164167cd147..be3c81221d11 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1815,8 +1815,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  	int ret;
>  
>  	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>  
> -	if (!test_and_clear_bit(I915_RESET_IN_PROGRESS, &error->flags))
> +	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
>  		return;
>  
>  	/* Clear any previous failed attempts at recovery. Time to try again. */
> @@ -1869,7 +1870,9 @@ void i915_reset(struct drm_i915_private *dev_priv)
>  wakeup:
>  	i915_gem_reset_finish(dev_priv);
>  	enable_irq(dev_priv->drm.irq);
> -	wake_up_bit(&error->flags, I915_RESET_IN_PROGRESS);
> +
> +	clear_bit(I915_RESET_HANDOFF, &error->flags);
> +	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
>  	return;
>  
>  error:
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5156bcc59dea..0b02a6d710e1 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1595,8 +1595,33 @@ struct i915_gpu_error {
>  	 */
>  	unsigned long reset_count;
>  
> +	/**
> +	 * flags: Control various stages of the GPU reset
> +	 *
> +	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
> +	 * other users acquiring the struct_mutex. To do this we set the
> +	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
> +	 * and then check for that bit before acquiring the struct_mutex (in
> +	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
> +	 * secondary role in preventing two concurrent global reset attempts.
> +	 *
> +	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
> +	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
> +	 * but it may be held by some long running waiter (that we cannot
> +	 * interrupt without causing trouble). Once we are ready to do the GPU
> +	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
> +	 * they already hold the struct_mutex and want to participate they can
> +	 * inspect the bit and do the reset directly, otherwise the worker
> +	 * waits for the struct_mutex.
> +	 *
> +	 * #I915_WEDGED - If reset fails and we can no longer use the GPU,
> +	 * we set the #I915_WEDGED bit. Prior to command submission, e.g.
> +	 * i915_gem_request_alloc(), this bit is checked and the sequence
> +	 * aborted (with -EIO reported to userspace) if set.
> +	 */
>  	unsigned long flags;
> -#define I915_RESET_IN_PROGRESS	0
> +#define I915_RESET_BACKOFF	0
> +#define I915_RESET_HANDOFF	1
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>  
>  	/**
> @@ -3387,9 +3412,14 @@ i915_gem_find_active_request(struct intel_engine_cs *engine);
>  
>  void i915_gem_retire_requests(struct drm_i915_private *dev_priv);
>  
> -static inline bool i915_reset_in_progress(struct i915_gpu_error *error)
> +static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> +{
> +	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
> +}
> +
> +static inline bool i915_reset_handoff(struct i915_gpu_error *error)
>  {
> -	return unlikely(test_bit(I915_RESET_IN_PROGRESS, &error->flags));
> +	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
>  }
>  
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
> @@ -3397,9 +3427,9 @@ static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
>  	return unlikely(test_bit(I915_WEDGED, &error->flags));
>  }
>  
> -static inline bool i915_reset_in_progress_or_wedged(struct i915_gpu_error *error)
> +static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
>  {
> -	return i915_reset_in_progress(error) | i915_terminally_wedged(error);
> +	return i915_reset_backoff(error) | i915_terminally_wedged(error);
>  }
>  
>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d87983ba536f..ecd1b038318a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -103,16 +103,13 @@ i915_gem_wait_for_error(struct i915_gpu_error *error)
>  
>  	might_sleep();
>  
> -	if (!i915_reset_in_progress(error))
> -		return 0;
> -
>  	/*
>  	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
>  	 * userspace. If it takes that long something really bad is going on and
>  	 * we should simply try to bail out and fail as gracefully as possible.
>  	 */
>  	ret = wait_event_interruptible_timeout(error->reset_queue,
> -					       !i915_reset_in_progress(error),
> +					       !i915_reset_backoff(error),
>  					       I915_RESET_TIMEOUT);
>  	if (ret == 0) {
>  		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 1e1d9f2072cd..0e8d1010cecb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -1012,7 +1012,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
>  
>  static bool __i915_wait_request_check_and_reset(struct drm_i915_gem_request *request)
>  {
> -	if (likely(!i915_reset_in_progress(&request->i915->gpu_error)))
> +	if (likely(!i915_reset_handoff(&request->i915->gpu_error)))
>  		return false;
>  
>  	__set_current_state(TASK_RUNNING);
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index e646c4eba65d..495e6a79cacf 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2631,22 +2631,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>  	return ret;
>  }
>  
> -static void i915_error_wake_up(struct drm_i915_private *dev_priv)
> -{
> -	/*
> -	 * Notify all waiters for GPU completion events that reset state has
> -	 * been changed, and that they need to restart their wait after
> -	 * checking for potential errors (and bail out to drop locks if there is
> -	 * a gpu reset pending so that i915_error_work_func can acquire them).
> -	 */
> -
> -	/* Wake up __wait_seqno, potentially holding dev->struct_mutex. */
> -	wake_up_all(&dev_priv->gpu_error.wait_queue);
> -
> -	/* Wake up intel_crtc_wait_for_pending_flips, holding crtc->mutex. */
> -	wake_up_all(&dev_priv->pending_flip_queue);
> -}
> -
>  /**
>   * i915_reset_and_wakeup - do process context error handling work
>   * @dev_priv: i915 device private
> @@ -2676,6 +2660,9 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	intel_runtime_pm_get(dev_priv);
>  	intel_prepare_reset(dev_priv);
>  
> +	set_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags);
> +	wake_up_all(&dev_priv->gpu_error.wait_queue);
> +
>  	do {
>  		/*
>  		 * All state reset _must_ be completed before we update the
> @@ -2690,7 +2677,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  
>  		/* We need to wait for anyone holding the lock to wakeup */
>  	} while (wait_on_bit_timeout(&dev_priv->gpu_error.flags,
> -				     I915_RESET_IN_PROGRESS,
> +				     I915_RESET_HANDOFF,
>  				     TASK_UNINTERRUPTIBLE,
>  				     HZ));
>  
> @@ -2705,6 +2692,7 @@ static void i915_reset_and_wakeup(struct drm_i915_private *dev_priv)
>  	 * Note: The wake_up also serves as a memory barrier so that
>  	 * waiters see the updated value of the dev_priv->gpu_error.
>  	 */
> +	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
>  	wake_up_all(&dev_priv->gpu_error.reset_queue);
>  }
>  
> @@ -2788,24 +2776,10 @@ void i915_handle_error(struct drm_i915_private *dev_priv,
>  	if (!engine_mask)
>  		return;
>  
> -	if (test_and_set_bit(I915_RESET_IN_PROGRESS,
> +	if (test_and_set_bit(I915_RESET_BACKOFF,
>  			     &dev_priv->gpu_error.flags))
>  		return;
>  
> -	/*
> -	 * Wakeup waiting processes so that the reset function
> -	 * i915_reset_and_wakeup doesn't deadlock trying to grab
> -	 * various locks. By bumping the reset counter first, the woken
> -	 * processes will see a reset in progress and back off,
> -	 * releasing their locks and then wait for the reset completion.
> -	 * We must do this for _all_ gpu waiters that might hold locks
> -	 * that the reset work needs to acquire.
> -	 *
> -	 * Note: The wake_up also provides a memory barrier to ensure that the
> -	 * waiters see the updated value of the reset flags.
> -	 */
> -	i915_error_wake_up(dev_priv);
> -
>  	i915_reset_and_wakeup(dev_priv);
>  }
>  
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 4b73513b46c1..5959c9b6dc97 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -3639,7 +3639,7 @@ static bool abort_flip_on_reset(struct intel_crtc *crtc)
>  {
>  	struct i915_gpu_error *error = &to_i915(crtc->base.dev)->gpu_error;
>  
> -	if (i915_reset_in_progress(error))
> +	if (i915_reset_backoff(error))
>  		return true;
>  
>  	if (crtc->reset_count != i915_reset_count(error))
> @@ -10595,7 +10595,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
>  		goto cleanup;
>  
>  	intel_crtc->reset_count = i915_reset_count(&dev_priv->gpu_error);
> -	if (i915_reset_in_progress_or_wedged(&dev_priv->gpu_error)) {
> +	if (i915_reset_backoff_or_wedged(&dev_priv->gpu_error)) {
>  		ret = -EIO;
>  		goto unlock;
>  	}
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index d4acee6730e9..6ec7c731a267 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -301,7 +301,8 @@ static int igt_global_reset(void *arg)
>  
>  	/* Check that we can issue a global GPU reset */
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
>  	reset_count = i915_reset_count(&i915->gpu_error);
> @@ -314,7 +315,8 @@ static int igt_global_reset(void *arg)
>  	}
>  	mutex_unlock(&i915->drm.struct_mutex);
>  
> -	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		err = -EIO;
>  
> @@ -330,7 +332,7 @@ static u32 fake_hangcheck(struct drm_i915_gem_request *rq)
>  
>  	reset_count = i915_reset_count(&rq->i915->gpu_error);
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &rq->i915->gpu_error.flags);
> +	set_bit(I915_RESET_HANDOFF, &rq->i915->gpu_error.flags);
>  	wake_up_all(&rq->i915->gpu_error.wait_queue);
>  
>  	return reset_count;
> @@ -357,7 +359,7 @@ static int igt_wait_reset(void *arg)
>  
>  	/* Check that we detect a stuck waiter and issue a reset */
>  
> -	set_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags);
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
>  	err = hang_init(&h, i915);
> @@ -388,8 +390,8 @@ static int igt_wait_reset(void *arg)
>  		err = timeout;
>  		goto out_rq;
>  	}
> -	GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS, &i915->gpu_error.flags));
>  
> +	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
>  	if (i915_reset_count(&i915->gpu_error) == reset_count) {
>  		pr_err("No GPU reset recorded!\n");
>  		err = -EINVAL;
> @@ -402,6 +404,7 @@ static int igt_wait_reset(void *arg)
>  	hang_fini(&h);
>  unlock:
>  	mutex_unlock(&i915->drm.struct_mutex);
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		return -EIO;
> @@ -422,6 +425,7 @@ static int igt_reset_queue(void *arg)
>  	if (!igt_can_mi_store_dword_imm(i915))
>  		return 0;
>  
> +	set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  	mutex_lock(&i915->drm.struct_mutex);
>  	err = hang_init(&h, i915);
>  	if (err)
> @@ -470,8 +474,9 @@ static int igt_reset_queue(void *arg)
>  
>  			i915_reset(i915);
>  
> -			GEM_BUG_ON(test_bit(I915_RESET_IN_PROGRESS,
> +			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
>  					    &i915->gpu_error.flags));
> +
>  			if (prev->fence.error != -EIO) {
>  				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
>  				       prev->fence.error);
> @@ -514,6 +519,7 @@ static int igt_reset_queue(void *arg)
>  	hang_fini(&h);
>  unlock:
>  	mutex_unlock(&i915->drm.struct_mutex);
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		return -EIO;
> -- 
> 2.11.0
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged
  2017-03-16 16:38   ` Mika Kuoppala
@ 2017-03-16 17:01     ` Chris Wilson
  2017-03-16 17:02     ` Mika Kuoppala
  1 sibling, 0 replies; 10+ messages in thread
From: Chris Wilson @ 2017-03-16 17:01 UTC (permalink / raw)
  To: Mika Kuoppala; +Cc: intel-gfx

On Thu, Mar 16, 2017 at 06:38:09PM +0200, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Provide some serialisation between user operations by waiting for the
> > reset initiated by setting i915_wedged to complete.

The automatic wait here makes
	echo 1 > i915_wedged; cat i915_error_state
do the right thing.

> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++
> >  1 file changed, 4 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 5fdf8c137235..ada823815b8a 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -4129,6 +4129,10 @@ i915_wedged_set(void *data, u64 val)
> >  	i915_handle_error(dev_priv, val,
> >  			  "Manually setting wedged to %llu", val);
> >  
> > +	wait_on_bit(&dev_priv->gpu_error.flags,
> > +		    I915_RESET_HANDOFF,
> > +		    TASK_UNINTERRUPTIBLE);
> > +
> 
> But what happens here if we have already prevented the reset and
> the handoff never clears? Seems like it is a stuck task as the
> handoff is not cleared on error path in i915_reset.

The error path should be clearing the handoff bit, and signaling the
wakeup.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged
  2017-03-16 16:38   ` Mika Kuoppala
  2017-03-16 17:01     ` Chris Wilson
@ 2017-03-16 17:02     ` Mika Kuoppala
  1 sibling, 0 replies; 10+ messages in thread
From: Mika Kuoppala @ 2017-03-16 17:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Mika Kuoppala <mika.kuoppala@linux.intel.com> writes:

> Chris Wilson <chris@chris-wilson.co.uk> writes:
>
>> Provide some serialisation between user operations by waiting for the
>> reset initiated by setting i915_wedged to complete.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
>> ---
>>  drivers/gpu/drm/i915/i915_debugfs.c | 4 ++++
>>  1 file changed, 4 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>> index 5fdf8c137235..ada823815b8a 100644
>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>> @@ -4129,6 +4129,10 @@ i915_wedged_set(void *data, u64 val)
>>  	i915_handle_error(dev_priv, val,
>>  			  "Manually setting wedged to %llu", val);
>>  
>> +	wait_on_bit(&dev_priv->gpu_error.flags,
>> +		    I915_RESET_HANDOFF,
>> +		    TASK_UNINTERRUPTIBLE);
>> +
>
> But what happens here if we have already prevented the reset and
> the handoff never clears? Seems like it is a stuck task as the
> handoff is not cleared on error path in i915_reset.
>

Forget the above. The goto in the i915_reset will handle that.

Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com>

> -Mika
>
>
>>  	return 0;
>>  }
>>  
>> -- 
>> 2.11.0
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2017-03-16 17:02 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-15 14:01 [PATCH 1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Chris Wilson
2017-03-15 14:01 ` [PATCH 2/4] drm/i915: Move engine->submit_request selection to a vfunc Chris Wilson
2017-03-15 14:01 ` [PATCH 3/4] drm/i915: Restore engine->submit_request before unwedging Chris Wilson
2017-03-15 14:01 ` [PATCH 4/4] drm/i915: Wait for reset to complete before returning from debugfs/i915_wedged Chris Wilson
2017-03-16 16:38   ` Mika Kuoppala
2017-03-16 17:01     ` Chris Wilson
2017-03-16 17:02     ` Mika Kuoppala
2017-03-15 15:10 ` ✗ Fi.CI.BAT: warning for series starting with [1/4] drm/i915: Split I915_RESET_IN_PROGRESS into two flags Patchwork
2017-03-16 16:18 ` [PATCH 1/4] " Mika Kuoppala
2017-03-16 16:47 ` Mika Kuoppala

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.