All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging
@ 2018-03-15 13:14 Chris Wilson
  2018-03-15 13:14 ` [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged Chris Wilson
                   ` (5 more replies)
  0 siblings, 6 replies; 12+ messages in thread
From: Chris Wilson @ 2018-03-15 13:14 UTC (permalink / raw)
  To: intel-gfx

We still have an odd race with wedging/unwedging as shown by igt/gem_eio
that defies expectations. Add some more trace_printks to try and
visualize the flow over the precipice.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c     | 14 ++++++++++++++
 drivers/gpu/drm/i915/i915_request.c | 23 +++++++++++++++++++++++
 2 files changed, 37 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 13d4b0e74641..2fbd622bba30 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3193,6 +3193,9 @@ void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
 
 static void nop_submit_request(struct i915_request *request)
 {
+	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
+		  request->engine->name,
+		  request->fence.context, request->fence.seqno);
 	dma_fence_set_error(&request->fence, -EIO);
 
 	i915_request_submit(request);
@@ -3202,6 +3205,9 @@ static void nop_complete_submit_request(struct i915_request *request)
 {
 	unsigned long flags;
 
+	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
+		  request->engine->name,
+		  request->fence.context, request->fence.seqno);
 	dma_fence_set_error(&request->fence, -EIO);
 
 	spin_lock_irqsave(&request->engine->timeline->lock, flags);
@@ -3215,6 +3221,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
+	GEM_TRACE("start\n");
+
 	if (drm_debug & DRM_UT_DRIVER) {
 		struct drm_printer p = drm_debug_printer(__func__);
 
@@ -3279,6 +3287,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		i915_gem_reset_finish_engine(engine);
 	}
 
+	GEM_TRACE("end\n");
+
 	wake_up_all(&i915->gpu_error.reset_queue);
 }
 
@@ -3291,6 +3301,8 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
 		return true;
 
+	GEM_TRACE("start\n");
+
 	/*
 	 * Before unwedging, make sure that all pending operations
 	 * are flushed and errored out - we may have requests waiting upon
@@ -3341,6 +3353,8 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	intel_engines_reset_default_submission(i915);
 	i915_gem_contexts_lost(i915);
 
+	GEM_TRACE("end\n");
+
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 1810fa1b81cb..43c7134a9b93 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -207,11 +207,16 @@ static int reset_all_global_seqno(struct drm_i915_private *i915, u32 seqno)
 	if (ret)
 		return ret;
 
+	GEM_BUG_ON(i915->gt.active_requests);
+
 	/* If the seqno wraps around, we need to clear the breadcrumb rbtree */
 	for_each_engine(engine, i915, id) {
 		struct i915_gem_timeline *timeline;
 		struct intel_timeline *tl = engine->timeline;
 
+		GEM_TRACE("%s seqno %d -> %d\n",
+			  engine->name, tl->seqno, seqno);
+
 		if (!i915_seqno_passed(seqno, tl->seqno)) {
 			/* Flush any waiters before we reuse the seqno */
 			intel_engine_disarm_breadcrumbs(engine);
@@ -381,6 +386,11 @@ static void i915_request_retire(struct i915_request *request)
 	struct intel_engine_cs *engine = request->engine;
 	struct i915_gem_active *active, *next;
 
+	GEM_TRACE("%s(%d) fence %llx:%d, global_seqno %d\n",
+		  engine->name, intel_engine_get_seqno(engine),
+		  request->fence.context, request->fence.seqno,
+		  request->global_seqno);
+
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
 	GEM_BUG_ON(!i915_request_completed(request));
@@ -488,6 +498,11 @@ void __i915_request_submit(struct i915_request *request)
 	struct intel_timeline *timeline;
 	u32 seqno;
 
+	GEM_TRACE("%s fence %llx:%d -> global_seqno %d\n",
+		  request->engine->name,
+		  request->fence.context, request->fence.seqno,
+		  engine->timeline->seqno);
+
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->timeline->lock);
 
@@ -537,6 +552,11 @@ void __i915_request_unsubmit(struct i915_request *request)
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
 
+	GEM_TRACE("%s fence %llx:%d <- global_seqno %d\n",
+		  request->engine->name,
+		  request->fence.context, request->fence.seqno,
+		  request->global_seqno);
+
 	GEM_BUG_ON(!irqs_disabled());
 	lockdep_assert_held(&engine->timeline->lock);
 
@@ -996,6 +1016,9 @@ void __i915_request_add(struct i915_request *request, bool flush_caches)
 	u32 *cs;
 	int err;
 
+	GEM_TRACE("%s fence %llx:%d\n",
+		  engine->name, request->fence.context, request->fence.seqno);
+
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	trace_i915_request_add(request);
 
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
@ 2018-03-15 13:14 ` Chris Wilson
  2018-03-15 13:20   ` [PATCH] " Chris Wilson
  2018-03-15 13:26 ` [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2018-03-15 13:14 UTC (permalink / raw)
  To: intel-gfx

If we fail to reset the GPU, we declare the machine wedged. However, the
GPU may well still be running in the background with an in-flight
request. So despite our efforts in cleaning up the request queue and
faking the breadcrumb in the HWSP, the GPU may eventually write the
in-flght seqno there breaking all of our assumptions and throwing the
driver into a deep turmoil, wedging beyond wedged.

To avoid this we ideally want to reset the GPU. Since that has already
failed, make sure the rings have the stop bit set instead. This is part
of the normal GPU reset sequence, but that is actually disabled by
igt/gem_eio to force the wedged state. If we assume the worst, we must
poke at the bit again before we give up.

Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c         |  1 +
 drivers/gpu/drm/i915/intel_engine_cs.c  | 43 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 drivers/gpu/drm/i915/intel_uncore.c     | 46 +--------------------------------
 4 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2fbd622bba30..0f69fc6d34b4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3238,6 +3238,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	 * rolling the global seqno forward (since this would complete requests
 	 * for which we haven't set the fence error to EIO yet).
 	 */
+	intel_engines_stop(i915, ALL_ENGINES);
 	for_each_engine(engine, i915, id) {
 		i915_gem_reset_prepare_engine(engine);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 337dfa56a738..44eddf10f4d1 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915)
 	return which;
 }
 
+static void gen3_stop_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	const u32 base = engine->mmio_base;
+	const i915_reg_t mode = RING_MI_MODE(base);
+
+	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
+	if (intel_wait_for_register_fw(dev_priv,
+				       mode,
+				       MODE_IDLE,
+				       MODE_IDLE,
+				       500))
+		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
+				 engine->name);
+
+	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
+	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
+
+	I915_WRITE_FW(RING_HEAD(base), 0);
+	I915_WRITE_FW(RING_TAIL(base), 0);
+	POSTING_READ_FW(RING_TAIL(base));
+
+	/* The ring must be empty before it is disabled */
+	I915_WRITE_FW(RING_CTL(base), 0);
+
+	/* Check acts as a post */
+	if (I915_READ_FW(RING_HEAD(base)) != 0)
+		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
+				 engine->name);
+}
+
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(i915) < 3)
+		return;
+
+	for_each_engine_masked(engine, i915, engine_mask, id)
+		gen3_stop_engine(engine);
+}
+
 static void print_request(struct drm_printer *m,
 			  struct i915_request *rq,
 			  const char *prefix)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1f50727a5ddb..1369f7c5b57e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
 void intel_engines_park(struct drm_i915_private *i915);
 void intel_engines_unpark(struct drm_i915_private *i915);
 
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask);
+
 void intel_engines_reset_default_submission(struct drm_i915_private *i915);
 unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 4df7c2ef8576..cda4c06acf16 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	const u32 base = engine->mmio_base;
-	const i915_reg_t mode = RING_MI_MODE(base);
-
-	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
-	if (intel_wait_for_register_fw(dev_priv,
-				       mode,
-				       MODE_IDLE,
-				       MODE_IDLE,
-				       500))
-		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
-				 engine->name);
-
-	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
-	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
-
-	I915_WRITE_FW(RING_HEAD(base), 0);
-	I915_WRITE_FW(RING_TAIL(base), 0);
-	POSTING_READ_FW(RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	I915_WRITE_FW(RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
-}
-
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (INTEL_GEN(dev_priv) < 3)
-		return;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
-}
-
 static bool i915_in_reset(struct pci_dev *pdev)
 {
 	u8 gdrst;
@@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		 *
 		 * FIXME: Wa for more modern gens needs to be validated
 		 */
-		i915_stop_engines(dev_priv, engine_mask);
+		intel_engines_stop(dev_priv, engine_mask);
 
 		ret = -ENODEV;
 		if (reset)
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] drm/i915: Stop engines when declaring the machine wedged
  2018-03-15 13:14 ` [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged Chris Wilson
@ 2018-03-15 13:20   ` Chris Wilson
  2018-03-15 15:10     ` [PATCH v2] " Chris Wilson
  2018-03-15 15:44     ` [PATCH] " Mika Kuoppala
  0 siblings, 2 replies; 12+ messages in thread
From: Chris Wilson @ 2018-03-15 13:20 UTC (permalink / raw)
  To: intel-gfx

If we fail to reset the GPU, we declare the machine wedged. However, the
GPU may well still be running in the background with an in-flight
request. So despite our efforts in cleaning up the request queue and
faking the breadcrumb in the HWSP, the GPU may eventually write the
in-flght seqno there breaking all of our assumptions and throwing the
driver into a deep turmoil, wedging beyond wedged.

To avoid this we ideally want to reset the GPU. Since that has already
failed, make sure the rings have the stop bit set instead. This is part
of the normal GPU reset sequence, but that is actually disabled by
igt/gem_eio to force the wedged state. If we assume the worst, we must
poke at the bit again before we give up.

Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
git add fail
We want to stop the tasklets before hitting stop-engines or else we
may trigger an early CS completion and more asserts.
-Chris
---
 drivers/gpu/drm/i915/i915_gem.c         |  2 ++
 drivers/gpu/drm/i915/intel_engine_cs.c  | 43 ++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
 drivers/gpu/drm/i915/intel_uncore.c     | 46 +--------------------------------
 4 files changed, 48 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2fbd622bba30..208619981ea1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3246,6 +3246,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	}
 	i915->caps.scheduler = 0;
 
+	intel_engines_stop(i915, ALL_ENGINES);
+
 	/*
 	 * Make sure no one is running the old callback before we proceed with
 	 * cancelling requests and resetting the completion tracking. Otherwise
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 337dfa56a738..44eddf10f4d1 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915)
 	return which;
 }
 
+static void gen3_stop_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	const u32 base = engine->mmio_base;
+	const i915_reg_t mode = RING_MI_MODE(base);
+
+	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
+	if (intel_wait_for_register_fw(dev_priv,
+				       mode,
+				       MODE_IDLE,
+				       MODE_IDLE,
+				       500))
+		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
+				 engine->name);
+
+	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
+	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
+
+	I915_WRITE_FW(RING_HEAD(base), 0);
+	I915_WRITE_FW(RING_TAIL(base), 0);
+	POSTING_READ_FW(RING_TAIL(base));
+
+	/* The ring must be empty before it is disabled */
+	I915_WRITE_FW(RING_CTL(base), 0);
+
+	/* Check acts as a post */
+	if (I915_READ_FW(RING_HEAD(base)) != 0)
+		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
+				 engine->name);
+}
+
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(i915) < 3)
+		return;
+
+	for_each_engine_masked(engine, i915, engine_mask, id)
+		gen3_stop_engine(engine);
+}
+
 static void print_request(struct drm_printer *m,
 			  struct i915_request *rq,
 			  const char *prefix)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 1f50727a5ddb..1369f7c5b57e 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
 void intel_engines_park(struct drm_i915_private *i915);
 void intel_engines_unpark(struct drm_i915_private *i915);
 
+void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask);
+
 void intel_engines_reset_default_submission(struct drm_i915_private *i915);
 unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 4df7c2ef8576..cda4c06acf16 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	const u32 base = engine->mmio_base;
-	const i915_reg_t mode = RING_MI_MODE(base);
-
-	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
-	if (intel_wait_for_register_fw(dev_priv,
-				       mode,
-				       MODE_IDLE,
-				       MODE_IDLE,
-				       500))
-		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
-				 engine->name);
-
-	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
-	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
-
-	I915_WRITE_FW(RING_HEAD(base), 0);
-	I915_WRITE_FW(RING_TAIL(base), 0);
-	POSTING_READ_FW(RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	I915_WRITE_FW(RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
-}
-
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (INTEL_GEN(dev_priv) < 3)
-		return;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
-}
-
 static bool i915_in_reset(struct pci_dev *pdev)
 {
 	u8 gdrst;
@@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
 		 *
 		 * FIXME: Wa for more modern gens needs to be validated
 		 */
-		i915_stop_engines(dev_priv, engine_mask);
+		intel_engines_stop(dev_priv, engine_mask);
 
 		ret = -ENODEV;
 		if (reset)
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
  2018-03-15 13:14 ` [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged Chris Wilson
@ 2018-03-15 13:26 ` Chris Wilson
  2018-03-15 13:41 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2018-03-15 13:26 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2018-03-15 13:14:50)
> We still have an odd race with wedging/unwedging as shown by igt/gem_eio
> that defies expectations. Add some more trace_printks to try and
> visualize the flow over the precipice.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

From earlier thread,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2)
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
  2018-03-15 13:14 ` [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged Chris Wilson
  2018-03-15 13:26 ` [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
@ 2018-03-15 13:41 ` Patchwork
  2018-03-15 15:44 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2018-03-15 13:41 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2)
URL   : https://patchwork.freedesktop.org/series/40029/
State : success

== Summary ==

Series 40029v2 series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging
https://patchwork.freedesktop.org/api/1.0/series/40029/revisions/2/mbox/

---- Known issues:

Test gem_exec_suspend:
        Subgroup basic-s3:
                pass       -> DMESG-WARN (fi-elk-e7500) fdo#103989
Test gem_mmap_gtt:
        Subgroup basic-small-bo-tiledx:
                fail       -> PASS       (fi-gdg-551) fdo#102575
Test kms_pipe_crc_basic:
        Subgroup hang-read-crc-pipe-b:
                pass       -> FAIL       (fi-skl-guc) fdo#103191

fdo#103989 https://bugs.freedesktop.org/show_bug.cgi?id=103989
fdo#102575 https://bugs.freedesktop.org/show_bug.cgi?id=102575
fdo#103191 https://bugs.freedesktop.org/show_bug.cgi?id=103191

fi-bdw-5557u     total:285  pass:264  dwarn:0   dfail:0   fail:0   skip:21  time:433s
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:443s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:381s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:539s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:298s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:514s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:517s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:504s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:411s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:580s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:511s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:530s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:584s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:430s
fi-gdg-551       total:285  pass:177  dwarn:0   dfail:0   fail:0   skip:108 time:317s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:537s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:402s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:421s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:466s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:427s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:474s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:468s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:515s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:652s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:442s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:524s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:546s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:504s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:498s
fi-skl-guc       total:285  pass:256  dwarn:0   dfail:0   fail:1   skip:28  time:425s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:444s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:594s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:401s

95626f201526d97b6f284b397fd73c8a2d514ff4 drm-tip: 2018y-03m-15d-11h-01m-14s UTC integration manifest
deea2d192754 drm/i915: Stop engines when declaring the machine wedged
83bf2da86881 drm/i915: Trace GEM steps between submit and wedging

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8363/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v2] drm/i915: Stop engines when declaring the machine wedged
  2018-03-15 13:20   ` [PATCH] " Chris Wilson
@ 2018-03-15 15:10     ` Chris Wilson
  2018-03-16  8:58       ` Mika Kuoppala
  2018-03-15 15:44     ` [PATCH] " Mika Kuoppala
  1 sibling, 1 reply; 12+ messages in thread
From: Chris Wilson @ 2018-03-15 15:10 UTC (permalink / raw)
  To: intel-gfx

If we fail to reset the GPU, we declare the machine wedged. However, the
GPU may well still be running in the background with an in-flight
request. So despite our efforts in cleaning up the request queue and
faking the breadcrumb in the HWSP, the GPU may eventually write the
in-flght seqno there breaking all of our assumptions and throwing the
driver into a deep turmoil, wedging beyond wedged.

To avoid this we ideally want to reset the GPU. Since that has already
failed, make sure the rings have the stop bit set instead. This is part
of the normal GPU reset sequence, but that is actually disabled by
igt/gem_eio to force the wedged state. If we assume the worst, we must
poke at the bit again before we give up.

v2: Move the intel_gpu_reset() from set-wedged in the reset error path
into i915_gem_set_wedged() itself. Even if the reset fails (e.g. if it is
disabled by gem_eio), it still tries to make sure the engines are
stopped. For i915_gem_set_wedged() callers from outside of i915_reset(),
this should make sure the GPU is disabled while the driver is marked as
being wedged.

Testcase: igt/gem_eio
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Michał Winiarski <michal.winiarski@intel.com>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Michel Thierry <michel.thierry@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c | 1 -
 drivers/gpu/drm/i915/i915_gem.c | 3 +++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f03555efc520..3df5193487f3 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -1995,7 +1995,6 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
 error:
 	i915_gem_set_wedged(i915);
 	i915_retire_requests(i915);
-	intel_gpu_reset(i915, ALL_ENGINES);
 	goto finish;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2fbd622bba30..802df8e1a544 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3246,6 +3246,9 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	}
 	i915->caps.scheduler = 0;
 
+	/* Even if the GPU reset fails, it should still stop the engines */
+	intel_gpu_reset(i915, ALL_ENGINES);
+
 	/*
 	 * Make sure no one is running the old callback before we proceed with
 	 * cancelling requests and resetting the completion tracking. Otherwise
-- 
2.16.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3)
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
                   ` (2 preceding siblings ...)
  2018-03-15 13:41 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
@ 2018-03-15 15:44 ` Patchwork
  2018-03-15 15:59 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
  2018-03-15 19:45 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork
  5 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2018-03-15 15:44 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3)
URL   : https://patchwork.freedesktop.org/series/40029/
State : success

== Summary ==

Series 40029v3 series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging
https://patchwork.freedesktop.org/api/1.0/series/40029/revisions/3/mbox/

---- Known issues:

Test gem_exec_suspend:
        Subgroup basic-s3:
                pass       -> DMESG-WARN (fi-elk-e7500) fdo#103989
        Subgroup basic-s4-devices:
                pass       -> INCOMPLETE (fi-bdw-5557u) fdo#104162

fdo#103989 https://bugs.freedesktop.org/show_bug.cgi?id=103989
fdo#104162 https://bugs.freedesktop.org/show_bug.cgi?id=104162

fi-bdw-5557u     total:109  pass:105  dwarn:0   dfail:0   fail:0   skip:3  
fi-bdw-gvtdvm    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:441s
fi-blb-e6850     total:285  pass:220  dwarn:1   dfail:0   fail:0   skip:64  time:381s
fi-bsw-n3050     total:285  pass:239  dwarn:0   dfail:0   fail:0   skip:46  time:540s
fi-bwr-2160      total:285  pass:180  dwarn:0   dfail:0   fail:0   skip:105 time:299s
fi-bxt-j4205     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:506s
fi-byt-j1900     total:285  pass:250  dwarn:0   dfail:0   fail:0   skip:35  time:515s
fi-byt-n2820     total:285  pass:246  dwarn:0   dfail:0   fail:0   skip:39  time:502s
fi-cfl-8700k     total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:409s
fi-cfl-s2        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:578s
fi-cfl-u         total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:510s
fi-cnl-drrs      total:285  pass:254  dwarn:3   dfail:0   fail:0   skip:28  time:544s
fi-cnl-y3        total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:581s
fi-elk-e7500     total:285  pass:225  dwarn:1   dfail:0   fail:0   skip:59  time:426s
fi-gdg-551       total:285  pass:176  dwarn:0   dfail:0   fail:1   skip:108 time:316s
fi-glk-1         total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:541s
fi-hsw-4770      total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:402s
fi-ilk-650       total:285  pass:225  dwarn:0   dfail:0   fail:0   skip:60  time:427s
fi-ivb-3520m     total:285  pass:256  dwarn:0   dfail:0   fail:0   skip:29  time:464s
fi-ivb-3770      total:285  pass:252  dwarn:0   dfail:0   fail:0   skip:33  time:426s
fi-kbl-7500u     total:285  pass:260  dwarn:1   dfail:0   fail:0   skip:24  time:482s
fi-kbl-7567u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:473s
fi-kbl-r         total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:513s
fi-pnv-d510      total:285  pass:219  dwarn:1   dfail:0   fail:0   skip:65  time:654s
fi-skl-6260u     total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:440s
fi-skl-6600u     total:285  pass:258  dwarn:0   dfail:0   fail:0   skip:27  time:527s
fi-skl-6700hq    total:285  pass:259  dwarn:0   dfail:0   fail:0   skip:26  time:549s
fi-skl-6700k2    total:285  pass:261  dwarn:0   dfail:0   fail:0   skip:24  time:498s
fi-skl-6770hq    total:285  pass:265  dwarn:0   dfail:0   fail:0   skip:20  time:500s
fi-skl-guc       total:285  pass:257  dwarn:0   dfail:0   fail:0   skip:28  time:427s
fi-skl-gvtdvm    total:285  pass:262  dwarn:0   dfail:0   fail:0   skip:23  time:443s
fi-snb-2520m     total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:579s
fi-snb-2600      total:285  pass:245  dwarn:0   dfail:0   fail:0   skip:40  time:404s

95626f201526d97b6f284b397fd73c8a2d514ff4 drm-tip: 2018y-03m-15d-11h-01m-14s UTC integration manifest
54a000090b07 drm/i915: Stop engines when declaring the machine wedged
62012cc290af drm/i915: Trace GEM steps between submit and wedging

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8366/issues.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] drm/i915: Stop engines when declaring the machine wedged
  2018-03-15 13:20   ` [PATCH] " Chris Wilson
  2018-03-15 15:10     ` [PATCH v2] " Chris Wilson
@ 2018-03-15 15:44     ` Mika Kuoppala
  1 sibling, 0 replies; 12+ messages in thread
From: Mika Kuoppala @ 2018-03-15 15:44 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we fail to reset the GPU, we declare the machine wedged. However, the
> GPU may well still be running in the background with an in-flight
> request. So despite our efforts in cleaning up the request queue and
> faking the breadcrumb in the HWSP, the GPU may eventually write the
> in-flght seqno there breaking all of our assumptions and throwing the
> driver into a deep turmoil, wedging beyond wedged.
>
> To avoid this we ideally want to reset the GPU. Since that has already
> failed, make sure the rings have the stop bit set instead. This is part
> of the normal GPU reset sequence, but that is actually disabled by
> igt/gem_eio to force the wedged state. If we assume the worst, we must
> poke at the bit again before we give up.

I am pondering if you would save so much trouble by just
adding

int intel_get_gpu_reset(struct drm_i915_private *dev_priv,
bool ignore_modparam)

and then forcing a stop engine and the last reset
straight in wedging

-Mika

>
> Testcase: igt/gem_eio
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> ---
> git add fail
> We want to stop the tasklets before hitting stop-engines or else we
> may trigger an early CS completion and more asserts.
> -Chris
> ---
>  drivers/gpu/drm/i915/i915_gem.c         |  2 ++
>  drivers/gpu/drm/i915/intel_engine_cs.c  | 43 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  2 ++
>  drivers/gpu/drm/i915/intel_uncore.c     | 46 +--------------------------------
>  4 files changed, 48 insertions(+), 45 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2fbd622bba30..208619981ea1 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3246,6 +3246,8 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	}
>  	i915->caps.scheduler = 0;
>  
> +	intel_engines_stop(i915, ALL_ENGINES);
> +
>  	/*
>  	 * Make sure no one is running the old callback before we proceed with
>  	 * cancelling requests and resetting the completion tracking. Otherwise
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 337dfa56a738..44eddf10f4d1 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1727,6 +1727,49 @@ unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915)
>  	return which;
>  }
>  
> +static void gen3_stop_engine(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	const u32 base = engine->mmio_base;
> +	const i915_reg_t mode = RING_MI_MODE(base);
> +
> +	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
> +	if (intel_wait_for_register_fw(dev_priv,
> +				       mode,
> +				       MODE_IDLE,
> +				       MODE_IDLE,
> +				       500))
> +		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
> +				 engine->name);
> +
> +	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> +	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> +
> +	I915_WRITE_FW(RING_HEAD(base), 0);
> +	I915_WRITE_FW(RING_TAIL(base), 0);
> +	POSTING_READ_FW(RING_TAIL(base));
> +
> +	/* The ring must be empty before it is disabled */
> +	I915_WRITE_FW(RING_CTL(base), 0);
> +
> +	/* Check acts as a post */
> +	if (I915_READ_FW(RING_HEAD(base)) != 0)
> +		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> +				 engine->name);
> +}
> +
> +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	if (INTEL_GEN(i915) < 3)
> +		return;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, id)
> +		gen3_stop_engine(engine);
> +}
> +
>  static void print_request(struct drm_printer *m,
>  			  struct i915_request *rq,
>  			  const char *prefix)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 1f50727a5ddb..1369f7c5b57e 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -1018,6 +1018,8 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
>  void intel_engines_park(struct drm_i915_private *i915);
>  void intel_engines_unpark(struct drm_i915_private *i915);
>  
> +void intel_engines_stop(struct drm_i915_private *i915, unsigned engine_mask);
> +
>  void intel_engines_reset_default_submission(struct drm_i915_private *i915);
>  unsigned int intel_engines_has_context_isolation(struct drm_i915_private *i915);
>  
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 4df7c2ef8576..cda4c06acf16 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1639,50 +1639,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	const u32 base = engine->mmio_base;
> -	const i915_reg_t mode = RING_MI_MODE(base);
> -
> -	I915_WRITE_FW(mode, _MASKED_BIT_ENABLE(STOP_RING));
> -	if (intel_wait_for_register_fw(dev_priv,
> -				       mode,
> -				       MODE_IDLE,
> -				       MODE_IDLE,
> -				       500))
> -		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n",
> -				 engine->name);
> -
> -	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> -	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> -
> -	I915_WRITE_FW(RING_HEAD(base), 0);
> -	I915_WRITE_FW(RING_TAIL(base), 0);
> -	POSTING_READ_FW(RING_TAIL(base));
> -
> -	/* The ring must be empty before it is disabled */
> -	I915_WRITE_FW(RING_CTL(base), 0);
> -
> -	/* Check acts as a post */
> -	if (I915_READ_FW(RING_HEAD(base)) != 0)
> -		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> -				 engine->name);
> -}
> -
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> -}
> -
>  static bool i915_in_reset(struct pci_dev *pdev)
>  {
>  	u8 gdrst;
> @@ -2057,7 +2013,7 @@ int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
>  		 *
>  		 * FIXME: Wa for more modern gens needs to be validated
>  		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> +		intel_engines_stop(dev_priv, engine_mask);
>  
>  		ret = -ENODEV;
>  		if (reset)
> -- 
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2)
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
                   ` (3 preceding siblings ...)
  2018-03-15 15:44 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork
@ 2018-03-15 15:59 ` Patchwork
  2018-03-15 19:45 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork
  5 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2018-03-15 15:59 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2)
URL   : https://patchwork.freedesktop.org/series/40029/
State : success

== Summary ==

---- Known issues:

Test gem_eio:
        Subgroup in-flight-external:
                incomplete -> PASS       (shard-apl) fdo#105341 +1
Test kms_rotation_crc:
        Subgroup cursor-rotation-180:
                pass       -> FAIL       (shard-snb) fdo#105185
Test kms_sysfs_edid_timing:
                pass       -> WARN       (shard-apl) fdo#100047

fdo#105341 https://bugs.freedesktop.org/show_bug.cgi?id=105341
fdo#105185 https://bugs.freedesktop.org/show_bug.cgi?id=105185
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047

shard-apl        total:3442 pass:1814 dwarn:1   dfail:0   fail:7   skip:1619 time:12992s
shard-hsw        total:3442 pass:1768 dwarn:1   dfail:0   fail:1   skip:1671 time:11796s
shard-snb        total:3442 pass:1357 dwarn:1   dfail:0   fail:3   skip:2081 time:7204s
Blacklisted hosts:
shard-kbl        total:2097 pass:1170 dwarn:0   dfail:0   fail:7   skip:919 time:5852s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8363/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3)
  2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
                   ` (4 preceding siblings ...)
  2018-03-15 15:59 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
@ 2018-03-15 19:45 ` Patchwork
  5 siblings, 0 replies; 12+ messages in thread
From: Patchwork @ 2018-03-15 19:45 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3)
URL   : https://patchwork.freedesktop.org/series/40029/
State : success

== Summary ==

---- Known issues:

Test gem_eio:
        Subgroup in-flight-contexts:
                incomplete -> PASS       (shard-apl) fdo#105341 +1
Test kms_flip:
        Subgroup 2x-plain-flip-ts-check-interruptible:
                pass       -> FAIL       (shard-hsw) fdo#100368
        Subgroup flip-vs-panning-vs-hang:
                pass       -> DMESG-WARN (shard-snb) fdo#103821
Test kms_sysfs_edid_timing:
                pass       -> WARN       (shard-apl) fdo#100047

fdo#105341 https://bugs.freedesktop.org/show_bug.cgi?id=105341
fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
fdo#103821 https://bugs.freedesktop.org/show_bug.cgi?id=103821
fdo#100047 https://bugs.freedesktop.org/show_bug.cgi?id=100047

shard-apl        total:3442 pass:1814 dwarn:1   dfail:0   fail:7   skip:1619 time:12933s
shard-hsw        total:3442 pass:1767 dwarn:1   dfail:0   fail:2   skip:1671 time:11820s
shard-snb        total:3442 pass:1357 dwarn:2   dfail:0   fail:2   skip:2081 time:7237s
Blacklisted hosts:
shard-kbl        total:2141 pass:1185 dwarn:26  dfail:0   fail:8   skip:921 time:5833s

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_8366/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] drm/i915: Stop engines when declaring the machine wedged
  2018-03-15 15:10     ` [PATCH v2] " Chris Wilson
@ 2018-03-16  8:58       ` Mika Kuoppala
  2018-03-16 10:29         ` Chris Wilson
  0 siblings, 1 reply; 12+ messages in thread
From: Mika Kuoppala @ 2018-03-16  8:58 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we fail to reset the GPU, we declare the machine wedged. However, the
> GPU may well still be running in the background with an in-flight
> request. So despite our efforts in cleaning up the request queue and
> faking the breadcrumb in the HWSP, the GPU may eventually write the
> in-flght seqno there breaking all of our assumptions and throwing the
> driver into a deep turmoil, wedging beyond wedged.
>
> To avoid this we ideally want to reset the GPU. Since that has already
> failed, make sure the rings have the stop bit set instead. This is part
> of the normal GPU reset sequence, but that is actually disabled by
> igt/gem_eio to force the wedged state. If we assume the worst, we must
> poke at the bit again before we give up.
>
> v2: Move the intel_gpu_reset() from set-wedged in the reset error path
> into i915_gem_set_wedged() itself. Even if the reset fails (e.g. if it is
> disabled by gem_eio), it still tries to make sure the engines are
> stopped. For i915_gem_set_wedged() callers from outside of i915_reset(),
> this should make sure the GPU is disabled while the driver is marked as
> being wedged.
>
> Testcase: igt/gem_eio
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Cc: Michał Winiarski <michal.winiarski@intel.com>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Michel Thierry <michel.thierry@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c | 1 -
>  drivers/gpu/drm/i915/i915_gem.c | 3 +++
>  2 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f03555efc520..3df5193487f3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -1995,7 +1995,6 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
>  error:
>  	i915_gem_set_wedged(i915);
>  	i915_retire_requests(i915);
> -	intel_gpu_reset(i915, ALL_ENGINES);
>  	goto finish;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2fbd622bba30..802df8e1a544 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3246,6 +3246,9 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	}
>  	i915->caps.scheduler = 0;
>  
> +	/* Even if the GPU reset fails, it should still stop the engines */
> +	intel_gpu_reset(i915, ALL_ENGINES);
> +

Comment is very welcome in here as modparm.reset usage isn't
so transparent.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

>  	/*
>  	 * Make sure no one is running the old callback before we proceed with
>  	 * cancelling requests and resetting the completion tracking. Otherwise
> -- 
> 2.16.2
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] drm/i915: Stop engines when declaring the machine wedged
  2018-03-16  8:58       ` Mika Kuoppala
@ 2018-03-16 10:29         ` Chris Wilson
  0 siblings, 0 replies; 12+ messages in thread
From: Chris Wilson @ 2018-03-16 10:29 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2018-03-16 08:58:28)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > If we fail to reset the GPU, we declare the machine wedged. However, the
> > GPU may well still be running in the background with an in-flight
> > request. So despite our efforts in cleaning up the request queue and
> > faking the breadcrumb in the HWSP, the GPU may eventually write the
> > in-flght seqno there breaking all of our assumptions and throwing the
> > driver into a deep turmoil, wedging beyond wedged.
> >
> > To avoid this we ideally want to reset the GPU. Since that has already
> > failed, make sure the rings have the stop bit set instead. This is part
> > of the normal GPU reset sequence, but that is actually disabled by
> > igt/gem_eio to force the wedged state. If we assume the worst, we must
> > poke at the bit again before we give up.
> >
> > v2: Move the intel_gpu_reset() from set-wedged in the reset error path
> > into i915_gem_set_wedged() itself. Even if the reset fails (e.g. if it is
> > disabled by gem_eio), it still tries to make sure the engines are
> > stopped. For i915_gem_set_wedged() callers from outside of i915_reset(),
> > this should make sure the GPU is disabled while the driver is marked as
> > being wedged.
> >
> > Testcase: igt/gem_eio
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Cc: Michał Winiarski <michal.winiarski@intel.com>
> > Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> > Cc: Michel Thierry <michel.thierry@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c | 1 -
> >  drivers/gpu/drm/i915/i915_gem.c | 3 +++
> >  2 files changed, 3 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> > index f03555efc520..3df5193487f3 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.c
> > +++ b/drivers/gpu/drm/i915/i915_drv.c
> > @@ -1995,7 +1995,6 @@ void i915_reset(struct drm_i915_private *i915, unsigned int flags)
> >  error:
> >       i915_gem_set_wedged(i915);
> >       i915_retire_requests(i915);
> > -     intel_gpu_reset(i915, ALL_ENGINES);
> >       goto finish;
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 2fbd622bba30..802df8e1a544 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3246,6 +3246,9 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
> >       }
> >       i915->caps.scheduler = 0;
> >  
> > +     /* Even if the GPU reset fails, it should still stop the engines */
> > +     intel_gpu_reset(i915, ALL_ENGINES);
> > +
> 
> Comment is very welcome in here as modparm.reset usage isn't
> so transparent.
> 
> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

Ta, gem_eio tamed, hopefully!
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-03-16 10:29 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-15 13:14 [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
2018-03-15 13:14 ` [PATCH 2/2] drm/i915: Stop engines when declaring the machine wedged Chris Wilson
2018-03-15 13:20   ` [PATCH] " Chris Wilson
2018-03-15 15:10     ` [PATCH v2] " Chris Wilson
2018-03-16  8:58       ` Mika Kuoppala
2018-03-16 10:29         ` Chris Wilson
2018-03-15 15:44     ` [PATCH] " Mika Kuoppala
2018-03-15 13:26 ` [PATCH 1/2] drm/i915: Trace GEM steps between submit and wedging Chris Wilson
2018-03-15 13:41 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
2018-03-15 15:44 ` ✓ Fi.CI.BAT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork
2018-03-15 15:59 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev2) Patchwork
2018-03-15 19:45 ` ✓ Fi.CI.IGT: success for series starting with [1/2] drm/i915: Trace GEM steps between submit and wedging (rev3) Patchwork

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.