* Mika's reward
@ 2019-01-14 21:04 Chris Wilson
  2019-01-14 21:04 ` [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Chris Wilson
                   ` (8 more replies)
  0 siblings, 9 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

In preparation for relaxing the conditions under which we wait, so that
we can wait on the GPU from any context (e.g. while holding nearly any
other mutex), the first step is to remove taking struct_mutex from
inside GPU reset. The issue is that GPU reset is what rescues an
otherwise indefinite wait on the GPU, so any mutex held while waiting
on the GPU must not also be required inside reset (or the shrinker);
otherwise the reset can never run and the wait never completes.
struct_mutex circumvents this requirement by complicated recursion
avoidance, a lesson we do not want to repeat.
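
To make the constraint concrete, a purely illustrative sketch
(hypothetical code, not from any patch in this series):

/*
 * A waiter that holds a mutex while waiting on the GPU. If the GPU
 * hangs, only reset can complete the request, so the reset path must
 * never need that same mutex.
 */
static void waiter(struct i915_request *rq, struct mutex *m)
{
	mutex_lock(m);
	i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT); /* may never return */
	mutex_unlock(m);
}

static void reset_path(struct mutex *m)
{
	mutex_lock(m); /* deadlocks against the waiter above */
	/* ... perform the reset ... */
	mutex_unlock(m);
}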
-chris


* [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-14 21:04 Mika's reward Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-15 11:56   ` Mika Kuoppala
  2019-01-14 21:04 ` [PATCH 2/8] drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex Chris Wilson
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Make i915_gem_set_wedged() and i915_gem_unset_wedged() behave more
consistently if called concurrently.
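
Previously a second caller of i915_gem_set_wedged() would bail out as
soon as test_and_set_bit(I915_WEDGED) reported the bit already set,
even though the first caller could still be cancelling requests. With
the new wedge_mutex the whole transition runs under one lock, so
concurrent callers observe either a fully wedged or a fully unwedged
device. A rough sketch of the resulting shape (simplified from the
diff below):

/* Simplified sketch of i915_gem_set_wedged() after this patch. */
static void set_wedged(struct i915_gpu_error *error)
{
	mutex_lock(&error->wedge_mutex);
	if (!test_bit(I915_WEDGED, &error->flags)) {
		/* ... stop submission, cancel in-flight requests ... */
		smp_mb__before_atomic();
		set_bit(I915_WEDGED, &error->flags);
	}
	mutex_unlock(&error->wedge_mutex);
}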

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c               | 32 ++++++++++++++-----
 drivers/gpu/drm/i915/i915_gpu_error.h         |  4 ++-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  1 +
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0bfed33178e1..910c49befc50 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3173,10 +3173,15 @@ static void nop_submit_request(struct i915_request *request)
 
 void i915_gem_set_wedged(struct drm_i915_private *i915)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	GEM_TRACE("start\n");
+	mutex_lock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags)) {
+		mutex_unlock(&error->wedge_mutex);
+		return;
+	}
 
 	if (GEM_SHOW_DEBUG()) {
 		struct drm_printer p = drm_debug_printer(__func__);
@@ -3185,8 +3190,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 			intel_engine_dump(engine, &p, "%s\n", engine->name);
 	}
 
-	if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
-		goto out;
+	GEM_TRACE("start\n");
 
 	/*
 	 * First, stop submission to hw, but do not yet complete requests by
@@ -3222,23 +3226,31 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		intel_engine_wakeup(engine);
 	}
 
-out:
+	smp_mb__before_atomic();
+	set_bit(I915_WEDGED, &error->flags);
+
 	GEM_TRACE("end\n");
+	mutex_unlock(&error->wedge_mutex);
 
-	wake_up_all(&i915->gpu_error.reset_queue);
+	wake_up_all(&error->reset_queue);
 }
 
 bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
+	bool ret = false;
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
-	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
+
+	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
 
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
+	mutex_lock(&error->wedge_mutex);
+
 	GEM_TRACE("start\n");
 
 	/*
@@ -3272,7 +3284,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		 */
 		if (dma_fence_default_wait(&rq->fence, true,
 					   MAX_SCHEDULE_TIMEOUT) < 0)
-			return false;
+			goto unlock;
 	}
 	i915_retire_requests(i915);
 	GEM_BUG_ON(i915->gt.active_requests);
@@ -3295,8 +3307,11 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
+	ret = true;
+unlock:
+	mutex_unlock(&i915->gpu_error.wedge_mutex);
 
-	return true;
+	return ret;
 }
 
 static void
@@ -5692,6 +5707,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 			  i915_gem_idle_work_handler);
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
+	mutex_init(&dev_priv->gpu_error.wedge_mutex);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 6d9f45468ac1..604291f7762d 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -271,8 +271,8 @@ struct i915_gpu_error {
 #define I915_RESET_BACKOFF	0
 #define I915_RESET_HANDOFF	1
 #define I915_RESET_MODESET	2
+#define I915_RESET_ENGINE	3
 #define I915_WEDGED		(BITS_PER_LONG - 1)
-#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
 
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
@@ -283,6 +283,8 @@ struct i915_gpu_error {
 	/** Reason for the current *global* reset */
 	const char *reason;
 
+	struct mutex wedge_mutex; /* serialises wedging/unwedging */
+
 	/**
 	 * Waitqueue to signal when a hang is detected. Used to for waiters
 	 * to release the struct_mutex for the reset to procede.
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 082809569681..3cda66292e76 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -188,6 +188,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
 	if (!i915->wq)
-- 
2.20.1

* [PATCH 2/8] drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex
  2019-01-14 21:04 Mika's reward Chris Wilson
  2019-01-14 21:04 ` [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-14 21:04 ` [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

We have two classes of VM, global GTT and per-process GTT. In order to
allow ourselves the freedom to mix both along call chains, distinguish
the two classes with regard to their mutex and lockdep maps.
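
As a rough sketch of the idea (the helper name here is hypothetical;
the constants and the lockdep_set_subclass() call match the diff
below), giving the two classes distinct lockdep subclasses lets one
call chain take both mutexes without lockdep reporting it as recursive
locking of a single class:

#define VM_CLASS_GGTT 0
#define VM_CLASS_PPGTT 1

static void vm_init_mutex(struct i915_address_space *vm, int subclass)
{
	mutex_init(&vm->mutex);
	lockdep_set_subclass(&vm->mutex, subclass);
}

/*
 * e.g. nesting the global GTT mutex and a ppGTT mutex:
 *
 *	mutex_lock(&ggtt->vm.mutex);	(subclass VM_CLASS_GGTT)
 *	mutex_lock(&ppgtt->vm.mutex);	(subclass VM_CLASS_PPGTT)
 */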

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.c       | 10 +++++-----
 drivers/gpu/drm/i915/i915_gem_gtt.h       |  2 ++
 drivers/gpu/drm/i915/selftests/mock_gtt.c |  6 +++---
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index dbea14bf67cc..74e6d02dcbbf 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -473,8 +473,7 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
 	spin_unlock(&vm->free_pages.lock);
 }
 
-static void i915_address_space_init(struct i915_address_space *vm,
-				    struct drm_i915_private *dev_priv)
+static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 {
 	/*
 	 * The vm->mutex must be reclaim safe (for use in the shrinker).
@@ -482,6 +481,7 @@ static void i915_address_space_init(struct i915_address_space *vm,
 	 * attempt holding the lock is immediately reported by lockdep.
 	 */
 	mutex_init(&vm->mutex);
+	lockdep_set_subclass(&vm->mutex, subclass);
 	i915_gem_shrinker_taints_mutex(vm->i915, &vm->mutex);
 
 	GEM_BUG_ON(!vm->total);
@@ -1547,7 +1547,7 @@ static struct i915_hw_ppgtt *gen8_ppgtt_create(struct drm_i915_private *i915)
 	/* From bdw, there is support for read-only pages in the PPGTT. */
 	ppgtt->vm.has_read_only = true;
 
-	i915_address_space_init(&ppgtt->vm, i915);
+	i915_address_space_init(&ppgtt->vm, VM_CLASS_PPGTT);
 
 	/* There are only few exceptions for gen >=6. chv and bxt.
 	 * And we are not sure about the latter so play safe for now.
@@ -1996,7 +1996,7 @@ static struct i915_hw_ppgtt *gen6_ppgtt_create(struct drm_i915_private *i915)
 
 	ppgtt->base.vm.total = I915_PDES * GEN6_PTES * I915_GTT_PAGE_SIZE;
 
-	i915_address_space_init(&ppgtt->base.vm, i915);
+	i915_address_space_init(&ppgtt->base.vm, VM_CLASS_PPGTT);
 
 	ppgtt->base.vm.allocate_va_range = gen6_alloc_va_range;
 	ppgtt->base.vm.clear_range = gen6_ppgtt_clear_range;
@@ -3433,7 +3433,7 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
 	 * and beyond the end of the GTT if we do not provide a guard.
 	 */
 	mutex_lock(&dev_priv->drm.struct_mutex);
-	i915_address_space_init(&ggtt->vm, dev_priv);
+	i915_address_space_init(&ggtt->vm, VM_CLASS_GGTT);
 
 	ggtt->vm.is_ggtt = true;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index e2360f16427a..9229b03d629b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -288,6 +288,8 @@ struct i915_address_space {
 	bool closed;
 
 	struct mutex mutex; /* protects vma and our lists */
+#define VM_CLASS_GGTT 0
+#define VM_CLASS_PPGTT 1
 
 	u64 scratch_pte;
 	struct i915_page_dma scratch_page;
diff --git a/drivers/gpu/drm/i915/selftests/mock_gtt.c b/drivers/gpu/drm/i915/selftests/mock_gtt.c
index 6ae418c76015..976c862b3842 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gtt.c
@@ -70,7 +70,7 @@ mock_ppgtt(struct drm_i915_private *i915,
 	ppgtt->vm.total = round_down(U64_MAX, PAGE_SIZE);
 	ppgtt->vm.file = ERR_PTR(-ENODEV);
 
-	i915_address_space_init(&ppgtt->vm, i915);
+	i915_address_space_init(&ppgtt->vm, VM_CLASS_PPGTT);
 
 	ppgtt->vm.clear_range = nop_clear_range;
 	ppgtt->vm.insert_page = mock_insert_page;
@@ -102,6 +102,7 @@ void mock_init_ggtt(struct drm_i915_private *i915)
 	struct i915_ggtt *ggtt = &i915->ggtt;
 
 	ggtt->vm.i915 = i915;
+	ggtt->vm.is_ggtt = true;
 
 	ggtt->gmadr = (struct resource) DEFINE_RES_MEM(0, 2048 * PAGE_SIZE);
 	ggtt->mappable_end = resource_size(&ggtt->gmadr);
@@ -117,9 +118,8 @@ void mock_init_ggtt(struct drm_i915_private *i915)
 	ggtt->vm.vma_ops.set_pages   = ggtt_set_pages;
 	ggtt->vm.vma_ops.clear_pages = clear_pages;
 
-	i915_address_space_init(&ggtt->vm, i915);
 
-	ggtt->vm.is_ggtt = true;
+	i915_address_space_init(&ggtt->vm, VM_CLASS_GGTT);
 }
 
 void mock_fini_ggtt(struct drm_i915_private *i915)
-- 
2.20.1

* [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c
  2019-01-14 21:04 Mika's reward Chris Wilson
  2019-01-14 21:04 ` [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Chris Wilson
  2019-01-14 21:04 ` [PATCH 2/8] drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-16 15:06   ` Mika Kuoppala
  2019-01-14 21:04 ` [PATCH 4/8] drm/i915: Make all GPU resets atomic Chris Wilson
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Currently the code to reset the GPU and our state is spread widely
across a few files. Pull the logic together into a common file.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile                 |    3 +-
 drivers/gpu/drm/i915/i915_debugfs.c           |    2 +
 drivers/gpu/drm/i915/i915_drv.c               |  206 +--
 drivers/gpu/drm/i915/i915_drv.h               |   33 +-
 drivers/gpu/drm/i915/i915_gem.c               |  446 +-----
 drivers/gpu/drm/i915/i915_gem_gtt.c           |    1 +
 drivers/gpu/drm/i915/i915_irq.c               |  238 ---
 drivers/gpu/drm/i915/i915_request.c           |    1 +
 drivers/gpu/drm/i915/i915_reset.c             | 1389 +++++++++++++++++
 drivers/gpu/drm/i915/i915_reset.h             |   56 +
 drivers/gpu/drm/i915/intel_display.c          |   15 +-
 drivers/gpu/drm/i915/intel_engine_cs.c        |    1 +
 drivers/gpu/drm/i915/intel_guc.h              |    3 +
 drivers/gpu/drm/i915/intel_hangcheck.c        |    1 +
 drivers/gpu/drm/i915/intel_uc.c               |    1 +
 drivers/gpu/drm/i915/intel_uncore.c           |  556 -------
 drivers/gpu/drm/i915/selftests/intel_lrc.c    |    2 +
 .../drm/i915/selftests/intel_workarounds.c    |    1 +
 18 files changed, 1483 insertions(+), 1472 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_reset.c
 create mode 100644 drivers/gpu/drm/i915/i915_reset.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index c34bee16730d..611115ed00db 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -40,9 +40,10 @@ i915-y := i915_drv.o \
 	  i915_mm.o \
 	  i915_params.o \
 	  i915_pci.o \
+	  i915_reset.o \
 	  i915_suspend.o \
-	  i915_syncmap.o \
 	  i915_sw_fence.o \
+	  i915_syncmap.o \
 	  i915_sysfs.o \
 	  intel_csr.o \
 	  intel_device_info.o \
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index da6d2581cb0e..a93abb2274e6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -32,6 +32,8 @@
 #include "intel_drv.h"
 #include "intel_guc_submission.h"
 
+#include "i915_reset.h"
+
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
 {
 	return to_i915(node->minor->dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index dafbbfadd1ad..f462a4d28af4 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -48,6 +48,7 @@
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "i915_pmu.h"
+#include "i915_reset.h"
 #include "i915_query.h"
 #include "i915_vgpu.h"
 #include "intel_drv.h"
@@ -2205,211 +2206,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
 	return i915_drm_resume(dev);
 }
 
-/**
- * i915_reset - reset chip after a hang
- * @i915: #drm_i915_private to reset
- * @stalled_mask: mask of the stalled engines with the guilty requests
- * @reason: user error message for why we are resetting
- *
- * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
- * on failure.
- *
- * Caller must hold the struct_mutex.
- *
- * Procedure is fairly simple:
- *   - reset the chip using the reset reg
- *   - re-init context state
- *   - re-init hardware status page
- *   - re-init ring buffer
- *   - re-init interrupt state
- *   - re-init display
- */
-void i915_reset(struct drm_i915_private *i915,
-		unsigned int stalled_mask,
-		const char *reason)
-{
-	struct i915_gpu_error *error = &i915->gpu_error;
-	int ret;
-	int i;
-
-	GEM_TRACE("flags=%lx\n", error->flags);
-
-	might_sleep();
-	lockdep_assert_held(&i915->drm.struct_mutex);
-	assert_rpm_wakelock_held(i915);
-	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
-
-	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-		return;
-
-	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
-		goto wakeup;
-
-	if (reason)
-		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
-	error->reset_count++;
-
-	ret = i915_gem_reset_prepare(i915);
-	if (ret) {
-		dev_err(i915->drm.dev, "GPU recovery failed\n");
-		goto taint;
-	}
-
-	if (!intel_has_gpu_reset(i915)) {
-		if (i915_modparams.reset)
-			dev_err(i915->drm.dev, "GPU reset not supported\n");
-		else
-			DRM_DEBUG_DRIVER("GPU reset disabled\n");
-		goto error;
-	}
-
-	for (i = 0; i < 3; i++) {
-		ret = intel_gpu_reset(i915, ALL_ENGINES);
-		if (ret == 0)
-			break;
-
-		msleep(100);
-	}
-	if (ret) {
-		dev_err(i915->drm.dev, "Failed to reset chip\n");
-		goto taint;
-	}
-
-	/* Ok, now get things going again... */
-
-	/*
-	 * Everything depends on having the GTT running, so we need to start
-	 * there.
-	 */
-	ret = i915_ggtt_enable_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	i915_gem_reset(i915, stalled_mask);
-	intel_overlay_reset(i915);
-
-	/*
-	 * Next we need to restore the context, but we don't use those
-	 * yet either...
-	 *
-	 * Ring buffer needs to be re-initialized in the KMS case, or if X
-	 * was running at the time of the reset (i.e. we weren't VT
-	 * switched away).
-	 */
-	ret = i915_gem_init_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	i915_queue_hangcheck(i915);
-
-finish:
-	i915_gem_reset_finish(i915);
-wakeup:
-	clear_bit(I915_RESET_HANDOFF, &error->flags);
-	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
-	return;
-
-taint:
-	/*
-	 * History tells us that if we cannot reset the GPU now, we
-	 * never will. This then impacts everything that is run
-	 * subsequently. On failing the reset, we mark the driver
-	 * as wedged, preventing further execution on the GPU.
-	 * We also want to go one step further and add a taint to the
-	 * kernel so that any subsequent faults can be traced back to
-	 * this failure. This is important for CI, where if the
-	 * GPU/driver fails we would like to reboot and restart testing
-	 * rather than continue on into oblivion. For everyone else,
-	 * the system should still plod along, but they have been warned!
-	 */
-	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
-error:
-	i915_gem_set_wedged(i915);
-	i915_retire_requests(i915);
-	goto finish;
-}
-
-static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
-					struct intel_engine_cs *engine)
-{
-	return intel_gpu_reset(dev_priv, intel_engine_flag(engine));
-}
-
-/**
- * i915_reset_engine - reset GPU engine to recover from a hang
- * @engine: engine to reset
- * @msg: reason for GPU reset; or NULL for no dev_notice()
- *
- * Reset a specific GPU engine. Useful if a hang is detected.
- * Returns zero on successful reset or otherwise an error code.
- *
- * Procedure is:
- *  - identifies the request that caused the hang and it is dropped
- *  - reset engine (which will force the engine to idle)
- *  - re-init/configure engine
- */
-int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
-{
-	struct i915_gpu_error *error = &engine->i915->gpu_error;
-	struct i915_request *active_request;
-	int ret;
-
-	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
-	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
-
-	active_request = i915_gem_reset_prepare_engine(engine);
-	if (IS_ERR_OR_NULL(active_request)) {
-		/* Either the previous reset failed, or we pardon the reset. */
-		ret = PTR_ERR(active_request);
-		goto out;
-	}
-
-	if (msg)
-		dev_notice(engine->i915->drm.dev,
-			   "Resetting %s for %s\n", engine->name, msg);
-	error->reset_engine_count[engine->id]++;
-
-	if (!engine->i915->guc.execbuf_client)
-		ret = intel_gt_reset_engine(engine->i915, engine);
-	else
-		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
-	if (ret) {
-		/* If we fail here, we expect to fallback to a global reset */
-		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
-				 engine->i915->guc.execbuf_client ? "GuC " : "",
-				 engine->name, ret);
-		goto out;
-	}
-
-	/*
-	 * The request that caused the hang is stuck on elsp, we know the
-	 * active request and can drop it, adjust head to skip the offending
-	 * request to resume executing remaining requests in the queue.
-	 */
-	i915_gem_reset_engine(engine, active_request, true);
-
-	/*
-	 * The engine and its registers (and workarounds in case of render)
-	 * have been reset to their default values. Follow the init_ring
-	 * process to program RING_MODE, HWSP and re-enable submission.
-	 */
-	ret = engine->init_hw(engine);
-	if (ret)
-		goto out;
-
-out:
-	intel_engine_cancel_stop_cs(engine);
-	i915_gem_reset_finish_engine(engine);
-	return ret;
-}
-
 static int i915_pm_prepare(struct device *kdev)
 {
 	struct pci_dev *pdev = to_pci_dev(kdev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fa99824f63b3..224d433ac7b6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2611,19 +2611,7 @@ extern const struct dev_pm_ops i915_pm_ops;
 extern int i915_driver_load(struct pci_dev *pdev,
 			    const struct pci_device_id *ent);
 extern void i915_driver_unload(struct drm_device *dev);
-extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
-extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
-
-extern void i915_reset(struct drm_i915_private *i915,
-		       unsigned int stalled_mask,
-		       const char *reason);
-extern int i915_reset_engine(struct intel_engine_cs *engine,
-			     const char *reason);
-
-extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
-extern int intel_reset_guc(struct drm_i915_private *dev_priv);
-extern int intel_guc_reset_engine(struct intel_guc *guc,
-				  struct intel_engine_cs *engine);
+
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
@@ -2666,20 +2654,11 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
 			   &dev_priv->gpu_error.hangcheck_work, delay);
 }
 
-__printf(4, 5)
-void i915_handle_error(struct drm_i915_private *dev_priv,
-		       u32 engine_mask,
-		       unsigned long flags,
-		       const char *fmt, ...);
-#define I915_ERROR_CAPTURE BIT(0)
-
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 extern void intel_irq_fini(struct drm_i915_private *dev_priv);
 int intel_irq_install(struct drm_i915_private *dev_priv);
 void intel_irq_uninstall(struct drm_i915_private *dev_priv);
 
-void i915_clear_error_registers(struct drm_i915_private *dev_priv);
-
 static inline bool intel_gvt_active(struct drm_i915_private *dev_priv)
 {
 	return dev_priv->gvt;
@@ -3044,18 +3023,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
 	return READ_ONCE(error->reset_engine_count[engine->id]);
 }
 
-struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
-int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
-void i915_gem_reset(struct drm_i915_private *dev_priv,
-		    unsigned int stalled_mask);
-void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
-void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_reset_engine(struct intel_engine_cs *engine,
-			   struct i915_request *request,
-			   bool stalled);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 910c49befc50..a7e0d61a45ea 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -27,15 +27,6 @@
 
 #include <drm/drm_vma_manager.h>
 #include <drm/i915_drm.h>
-#include "i915_drv.h"
-#include "i915_gem_clflush.h"
-#include "i915_vgpu.h"
-#include "i915_trace.h"
-#include "intel_drv.h"
-#include "intel_frontbuffer.h"
-#include "intel_mocs.h"
-#include "intel_workarounds.h"
-#include "i915_gemfs.h"
 #include <linux/dma-fence-array.h>
 #include <linux/kthread.h>
 #include <linux/reservation.h>
@@ -46,6 +37,18 @@
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
 
+#include "i915_drv.h"
+#include "i915_gem_clflush.h"
+#include "i915_gemfs.h"
+#include "i915_reset.h"
+#include "i915_trace.h"
+#include "i915_vgpu.h"
+
+#include "intel_drv.h"
+#include "intel_frontbuffer.h"
+#include "intel_mocs.h"
+#include "intel_workarounds.h"
+
 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
 
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
@@ -2859,61 +2862,6 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
-					const struct i915_gem_context *ctx)
-{
-	unsigned int score;
-	unsigned long prev_hang;
-
-	if (i915_gem_context_is_banned(ctx))
-		score = I915_CLIENT_SCORE_CONTEXT_BAN;
-	else
-		score = 0;
-
-	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
-	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
-		score += I915_CLIENT_SCORE_HANG_FAST;
-
-	if (score) {
-		atomic_add(score, &file_priv->ban_score);
-
-		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
-				 ctx->name, score,
-				 atomic_read(&file_priv->ban_score));
-	}
-}
-
-static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
-{
-	unsigned int score;
-	bool banned, bannable;
-
-	atomic_inc(&ctx->guilty_count);
-
-	bannable = i915_gem_context_is_bannable(ctx);
-	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
-
-	/* Cool contexts don't accumulate client ban score */
-	if (!bannable)
-		return;
-
-	if (banned) {
-		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
-				 ctx->name, atomic_read(&ctx->guilty_count),
-				 score);
-		i915_gem_context_set_banned(ctx);
-	}
-
-	if (!IS_ERR_OR_NULL(ctx->file_priv))
-		i915_gem_client_mark_guilty(ctx->file_priv, ctx);
-}
-
-static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
-{
-	atomic_inc(&ctx->active_count);
-}
-
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine)
 {
@@ -2944,376 +2892,6 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	return active;
 }
 
-/*
- * Ensure irq handler finishes, and not run again.
- * Also return the active request so that we only search for it once.
- */
-struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
-{
-	struct i915_request *request;
-
-	/*
-	 * During the reset sequence, we must prevent the engine from
-	 * entering RC6. As the context state is undefined until we restart
-	 * the engine, if it does enter RC6 during the reset, the state
-	 * written to the powercontext is undefined and so we may lose
-	 * GPU state upon resume, i.e. fail to restart after a reset.
-	 */
-	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-	request = engine->reset.prepare(engine);
-	if (request && request->fence.error == -EIO)
-		request = ERR_PTR(-EIO); /* Previous reset failed! */
-
-	return request;
-}
-
-int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
-{
-	struct intel_engine_cs *engine;
-	struct i915_request *request;
-	enum intel_engine_id id;
-	int err = 0;
-
-	for_each_engine(engine, dev_priv, id) {
-		request = i915_gem_reset_prepare_engine(engine);
-		if (IS_ERR(request)) {
-			err = PTR_ERR(request);
-			continue;
-		}
-
-		engine->hangcheck.active_request = request;
-	}
-
-	i915_gem_revoke_fences(dev_priv);
-	intel_uc_sanitize(dev_priv);
-
-	return err;
-}
-
-static void engine_skip_context(struct i915_request *request)
-{
-	struct intel_engine_cs *engine = request->engine;
-	struct i915_gem_context *hung_ctx = request->gem_context;
-	struct i915_timeline *timeline = request->timeline;
-	unsigned long flags;
-
-	GEM_BUG_ON(timeline == &engine->timeline);
-
-	spin_lock_irqsave(&engine->timeline.lock, flags);
-	spin_lock(&timeline->lock);
-
-	list_for_each_entry_continue(request, &engine->timeline.requests, link)
-		if (request->gem_context == hung_ctx)
-			i915_request_skip(request, -EIO);
-
-	list_for_each_entry(request, &timeline->requests, link)
-		i915_request_skip(request, -EIO);
-
-	spin_unlock(&timeline->lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
-}
-
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-i915_gem_reset_request(struct intel_engine_cs *engine,
-		       struct i915_request *request,
-		       bool stalled)
-{
-	/* The guilty request will get skipped on a hung engine.
-	 *
-	 * Users of client default contexts do not rely on logical
-	 * state preserved between batches so it is safe to execute
-	 * queued requests following the hang. Non default contexts
-	 * rely on preserved state, so skipping a batch loses the
-	 * evolution of the state and it needs to be considered corrupted.
-	 * Executing more queued batches on top of corrupted state is
-	 * risky. But we take the risk by trying to advance through
-	 * the queued requests in order to make the client behaviour
-	 * more predictable around resets, by not throwing away random
-	 * amount of batches it has prepared for execution. Sophisticated
-	 * clients can use gem_reset_stats_ioctl and dma fence status
-	 * (exported via sync_file info ioctl on explicit fences) to observe
-	 * when it loses the context state and should rebuild accordingly.
-	 *
-	 * The context ban, and ultimately the client ban, mechanism are safety
-	 * valves if client submission ends up resulting in nothing more than
-	 * subsequent hangs.
-	 */
-
-	if (i915_request_completed(request)) {
-		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
-			  engine->name, request->global_seqno,
-			  request->fence.context, request->fence.seqno,
-			  intel_engine_get_seqno(engine));
-		stalled = false;
-	}
-
-	if (stalled) {
-		i915_gem_context_mark_guilty(request->gem_context);
-		i915_request_skip(request, -EIO);
-
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(request->gem_context))
-			engine_skip_context(request);
-	} else {
-		/*
-		 * Since this is not the hung engine, it may have advanced
-		 * since the hang declaration. Double check by refinding
-		 * the active request at the time of the reset.
-		 */
-		request = i915_gem_find_active_request(engine);
-		if (request) {
-			unsigned long flags;
-
-			i915_gem_context_mark_innocent(request->gem_context);
-			dma_fence_set_error(&request->fence, -EAGAIN);
-
-			/* Rewind the engine to replay the incomplete rq */
-			spin_lock_irqsave(&engine->timeline.lock, flags);
-			request = list_prev_entry(request, link);
-			if (&request->link == &engine->timeline.requests)
-				request = NULL;
-			spin_unlock_irqrestore(&engine->timeline.lock, flags);
-		}
-	}
-
-	return request;
-}
-
-void i915_gem_reset_engine(struct intel_engine_cs *engine,
-			   struct i915_request *request,
-			   bool stalled)
-{
-	if (request)
-		request = i915_gem_reset_request(engine, request, stalled);
-
-	/* Setup the CS to resume from the breadcrumb of the hung request */
-	engine->reset.reset(engine, request);
-}
-
-void i915_gem_reset(struct drm_i915_private *dev_priv,
-		    unsigned int stalled_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	i915_retire_requests(dev_priv);
-
-	for_each_engine(engine, dev_priv, id) {
-		struct intel_context *ce;
-
-		i915_gem_reset_engine(engine,
-				      engine->hangcheck.active_request,
-				      stalled_mask & ENGINE_MASK(id));
-		ce = fetch_and_zero(&engine->last_retired_context);
-		if (ce)
-			intel_context_unpin(ce);
-
-		/*
-		 * Ostensibily, we always want a context loaded for powersaving,
-		 * so if the engine is idle after the reset, send a request
-		 * to load our scratch kernel_context.
-		 *
-		 * More mysteriously, if we leave the engine idle after a reset,
-		 * the next userspace batch may hang, with what appears to be
-		 * an incoherent read by the CS (presumably stale TLB). An
-		 * empty request appears sufficient to paper over the glitch.
-		 */
-		if (intel_engine_is_idle(engine)) {
-			struct i915_request *rq;
-
-			rq = i915_request_alloc(engine,
-						dev_priv->kernel_context);
-			if (!IS_ERR(rq))
-				i915_request_add(rq);
-		}
-	}
-
-	i915_gem_restore_fences(dev_priv);
-}
-
-void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
-{
-	engine->reset.finish(engine);
-
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
-}
-
-void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for_each_engine(engine, dev_priv, id) {
-		engine->hangcheck.active_request = NULL;
-		i915_gem_reset_finish_engine(engine);
-	}
-}
-
-static void nop_submit_request(struct i915_request *request)
-{
-	unsigned long flags;
-
-	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
-		  request->engine->name,
-		  request->fence.context, request->fence.seqno);
-	dma_fence_set_error(&request->fence, -EIO);
-
-	spin_lock_irqsave(&request->engine->timeline.lock, flags);
-	__i915_request_submit(request);
-	intel_engine_write_global_seqno(request->engine, request->global_seqno);
-	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
-}
-
-void i915_gem_set_wedged(struct drm_i915_private *i915)
-{
-	struct i915_gpu_error *error = &i915->gpu_error;
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
-		return;
-	}
-
-	if (GEM_SHOW_DEBUG()) {
-		struct drm_printer p = drm_debug_printer(__func__);
-
-		for_each_engine(engine, i915, id)
-			intel_engine_dump(engine, &p, "%s\n", engine->name);
-	}
-
-	GEM_TRACE("start\n");
-
-	/*
-	 * First, stop submission to hw, but do not yet complete requests by
-	 * rolling the global seqno forward (since this would complete requests
-	 * for which we haven't set the fence error to EIO yet).
-	 */
-	for_each_engine(engine, i915, id)
-		i915_gem_reset_prepare_engine(engine);
-
-	/* Even if the GPU reset fails, it should still stop the engines */
-	if (INTEL_GEN(i915) >= 5)
-		intel_gpu_reset(i915, ALL_ENGINES);
-
-	for_each_engine(engine, i915, id) {
-		engine->submit_request = nop_submit_request;
-		engine->schedule = NULL;
-	}
-	i915->caps.scheduler = 0;
-
-	/*
-	 * Make sure no request can slip through without getting completed by
-	 * either this call here to intel_engine_write_global_seqno, or the one
-	 * in nop_submit_request.
-	 */
-	synchronize_rcu();
-
-	/* Mark all executing requests as skipped */
-	for_each_engine(engine, i915, id)
-		engine->cancel_requests(engine);
-
-	for_each_engine(engine, i915, id) {
-		i915_gem_reset_finish_engine(engine);
-		intel_engine_wakeup(engine);
-	}
-
-	smp_mb__before_atomic();
-	set_bit(I915_WEDGED, &error->flags);
-
-	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
-
-	wake_up_all(&error->reset_queue);
-}
-
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
-{
-	struct i915_gpu_error *error = &i915->gpu_error;
-	struct i915_timeline *tl;
-	bool ret = false;
-
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
-	if (!test_bit(I915_WEDGED, &error->flags))
-		return true;
-
-	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
-		return false;
-
-	mutex_lock(&error->wedge_mutex);
-
-	GEM_TRACE("start\n");
-
-	/*
-	 * Before unwedging, make sure that all pending operations
-	 * are flushed and errored out - we may have requests waiting upon
-	 * third party fences. We marked all inflight requests as EIO, and
-	 * every execbuf since returned EIO, for consistency we want all
-	 * the currently pending requests to also be marked as EIO, which
-	 * is done inside our nop_submit_request - and so we must wait.
-	 *
-	 * No more can be submitted until we reset the wedged bit.
-	 */
-	list_for_each_entry(tl, &i915->gt.timelines, link) {
-		struct i915_request *rq;
-
-		rq = i915_gem_active_peek(&tl->last_request,
-					  &i915->drm.struct_mutex);
-		if (!rq)
-			continue;
-
-		/*
-		 * We can't use our normal waiter as we want to
-		 * avoid recursively trying to handle the current
-		 * reset. The basic dma_fence_default_wait() installs
-		 * a callback for dma_fence_signal(), which is
-		 * triggered by our nop handler (indirectly, the
-		 * callback enables the signaler thread which is
-		 * woken by the nop_submit_request() advancing the seqno
-		 * and when the seqno passes the fence, the signaler
-		 * then signals the fence waking us up).
-		 */
-		if (dma_fence_default_wait(&rq->fence, true,
-					   MAX_SCHEDULE_TIMEOUT) < 0)
-			goto unlock;
-	}
-	i915_retire_requests(i915);
-	GEM_BUG_ON(i915->gt.active_requests);
-
-	intel_engines_sanitize(i915, false);
-
-	/*
-	 * Undo nop_submit_request. We prevent all new i915 requests from
-	 * being queued (by disallowing execbuf whilst wedged) so having
-	 * waited for all active requests above, we know the system is idle
-	 * and do not have to worry about a thread being inside
-	 * engine->submit_request() as we swap over. So unlike installing
-	 * the nop_submit_request on reset, we can do this from normal
-	 * context and do not require stop_machine().
-	 */
-	intel_engines_reset_default_submission(i915);
-	i915_gem_contexts_lost(i915);
-
-	GEM_TRACE("end\n");
-
-	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
-	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-	ret = true;
-unlock:
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
-	return ret;
-}
-
 static void
 i915_gem_retire_work_handler(struct work_struct *work)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 74e6d02dcbbf..68e02e46186b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -37,6 +37,7 @@
 
 #include "i915_drv.h"
 #include "i915_vgpu.h"
+#include "i915_reset.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 94187e68d39a..1c6cf024a509 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2930,46 +2930,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-struct wedge_me {
-	struct delayed_work work;
-	struct drm_i915_private *i915;
-	const char *name;
-};
-
-static void wedge_me(struct work_struct *work)
-{
-	struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
-	dev_err(w->i915->drm.dev,
-		"%s timed out, cancelling all in-flight rendering.\n",
-		w->name);
-	i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
-			 struct drm_i915_private *i915,
-			 long timeout,
-			 const char *name)
-{
-	w->i915 = i915;
-	w->name = name;
-
-	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
-	schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
-	cancel_delayed_work_sync(&w->work);
-	destroy_delayed_work_on_stack(&w->work);
-	w->i915 = NULL;
-}
-
-#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
-	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
-	     (W)->i915;							\
-	     __fini_wedge((W)))
-
 static u32
 gen11_gt_engine_identity(struct drm_i915_private * const i915,
 			 const unsigned int bank, const unsigned int bit)
@@ -3180,204 +3140,6 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-static void i915_reset_device(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
-			      const char *reason)
-{
-	struct i915_gpu_error *error = &dev_priv->gpu_error;
-	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
-	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
-	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
-	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-	struct wedge_me w;
-
-	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
-
-	DRM_DEBUG_DRIVER("resetting chip\n");
-	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
-
-	/* Use a watchdog to ensure that our reset completes */
-	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
-		intel_prepare_reset(dev_priv);
-
-		error->reason = reason;
-		error->stalled_mask = engine_mask;
-
-		/* Signal that locked waiters should reset the GPU */
-		smp_mb__before_atomic();
-		set_bit(I915_RESET_HANDOFF, &error->flags);
-		wake_up_all(&error->wait_queue);
-
-		/* Wait for anyone holding the lock to wakeup, without
-		 * blocking indefinitely on struct_mutex.
-		 */
-		do {
-			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
-				i915_reset(dev_priv, engine_mask, reason);
-				mutex_unlock(&dev_priv->drm.struct_mutex);
-			}
-		} while (wait_on_bit_timeout(&error->flags,
-					     I915_RESET_HANDOFF,
-					     TASK_UNINTERRUPTIBLE,
-					     1));
-
-		error->stalled_mask = 0;
-		error->reason = NULL;
-
-		intel_finish_reset(dev_priv);
-	}
-
-	if (!test_bit(I915_WEDGED, &error->flags))
-		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
-}
-
-void i915_clear_error_registers(struct drm_i915_private *dev_priv)
-{
-	u32 eir;
-
-	if (!IS_GEN(dev_priv, 2))
-		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
-
-	if (INTEL_GEN(dev_priv) < 4)
-		I915_WRITE(IPEIR, I915_READ(IPEIR));
-	else
-		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
-
-	I915_WRITE(EIR, I915_READ(EIR));
-	eir = I915_READ(EIR);
-	if (eir) {
-		/*
-		 * some errors might have become stuck,
-		 * mask them.
-		 */
-		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
-		I915_WRITE(EMR, I915_READ(EMR) | eir);
-		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
-	}
-
-	if (INTEL_GEN(dev_priv) >= 8) {
-		I915_WRITE(GEN8_RING_FAULT_REG,
-			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
-		POSTING_READ(GEN8_RING_FAULT_REG);
-	} else if (INTEL_GEN(dev_priv) >= 6) {
-		struct intel_engine_cs *engine;
-		enum intel_engine_id id;
-
-		for_each_engine(engine, dev_priv, id) {
-			I915_WRITE(RING_FAULT_REG(engine),
-				   I915_READ(RING_FAULT_REG(engine)) &
-				   ~RING_FAULT_VALID);
-		}
-		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
-	}
-}
-
-/**
- * i915_handle_error - handle a gpu error
- * @dev_priv: i915 device private
- * @engine_mask: mask representing engines that are hung
- * @flags: control flags
- * @fmt: Error message format string
- *
- * Do some basic checking of register state at error time and
- * dump it to the syslog.  Also call i915_capture_error_state() to make
- * sure we get a record and make it available in debugfs.  Fire a uevent
- * so userspace knows something bad happened (should trigger collection
- * of a ring dump etc.).
- */
-void i915_handle_error(struct drm_i915_private *dev_priv,
-		       u32 engine_mask,
-		       unsigned long flags,
-		       const char *fmt, ...)
-{
-	struct intel_engine_cs *engine;
-	intel_wakeref_t wakeref;
-	unsigned int tmp;
-	char error_msg[80];
-	char *msg = NULL;
-
-	if (fmt) {
-		va_list args;
-
-		va_start(args, fmt);
-		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
-		va_end(args);
-
-		msg = error_msg;
-	}
-
-	/*
-	 * In most cases it's guaranteed that we get here with an RPM
-	 * reference held, for example because there is a pending GPU
-	 * request that won't finish until the reset is done. This
-	 * isn't the case at least when we get here by doing a
-	 * simulated reset via debugfs, so get an RPM reference.
-	 */
-	wakeref = intel_runtime_pm_get(dev_priv);
-
-	engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
-
-	if (flags & I915_ERROR_CAPTURE) {
-		i915_capture_error_state(dev_priv, engine_mask, msg);
-		i915_clear_error_registers(dev_priv);
-	}
-
-	/*
-	 * Try engine reset when available. We fall back to full reset if
-	 * single reset fails.
-	 */
-	if (intel_has_reset_engine(dev_priv) &&
-	    !i915_terminally_wedged(&dev_priv->gpu_error)) {
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
-			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &dev_priv->gpu_error.flags))
-				continue;
-
-			if (i915_reset_engine(engine, msg) == 0)
-				engine_mask &= ~intel_engine_flag(engine);
-
-			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &dev_priv->gpu_error.flags);
-			wake_up_bit(&dev_priv->gpu_error.flags,
-				    I915_RESET_ENGINE + engine->id);
-		}
-	}
-
-	if (!engine_mask)
-		goto out;
-
-	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
-		wait_event(dev_priv->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &dev_priv->gpu_error.flags));
-		goto out;
-	}
-
-	/* Prevent any other reset-engine attempt. */
-	for_each_engine(engine, dev_priv, tmp) {
-		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&dev_priv->gpu_error.flags))
-			wait_on_bit(&dev_priv->gpu_error.flags,
-				    I915_RESET_ENGINE + engine->id,
-				    TASK_UNINTERRUPTIBLE);
-	}
-
-	i915_reset_device(dev_priv, engine_mask, msg);
-
-	for_each_engine(engine, dev_priv, tmp) {
-		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &dev_priv->gpu_error.flags);
-	}
-
-	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
-	wake_up_all(&dev_priv->gpu_error.reset_queue);
-
-out:
-	intel_runtime_pm_put(dev_priv, wakeref);
-}
-
 /* Called from drm generic code, passed 'crtc' which
  * we use as a pipe index
  */
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index d1355154886a..5403d4e2cee0 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -29,6 +29,7 @@
 #include <linux/sched/signal.h>
 
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
new file mode 100644
index 000000000000..e2e40b44a9a8
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -0,0 +1,1389 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2008-2018 Intel Corporation
+ */
+
+#include <linux/sched/mm.h>
+
+#include "i915_drv.h"
+#include "i915_gpu_error.h"
+#include "i915_reset.h"
+
+#include "intel_guc.h"
+
+static void engine_skip_context(struct i915_request *rq)
+{
+	struct intel_engine_cs *engine = rq->engine;
+	struct i915_gem_context *hung_ctx = rq->gem_context;
+	struct i915_timeline *timeline = rq->timeline;
+	unsigned long flags;
+
+	GEM_BUG_ON(timeline == &engine->timeline);
+
+	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock(&timeline->lock);
+
+	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
+		if (rq->gem_context == hung_ctx)
+			i915_request_skip(rq, -EIO);
+
+	list_for_each_entry(rq, &timeline->requests, link)
+		i915_request_skip(rq, -EIO);
+
+	spin_unlock(&timeline->lock);
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+}
+
+static void client_mark_guilty(struct drm_i915_file_private *file_priv,
+			       const struct i915_gem_context *ctx)
+{
+	unsigned int score;
+	unsigned long prev_hang;
+
+	if (i915_gem_context_is_banned(ctx))
+		score = I915_CLIENT_SCORE_CONTEXT_BAN;
+	else
+		score = 0;
+
+	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
+	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
+		score += I915_CLIENT_SCORE_HANG_FAST;
+
+	if (score) {
+		atomic_add(score, &file_priv->ban_score);
+
+		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
+				 ctx->name, score,
+				 atomic_read(&file_priv->ban_score));
+	}
+}
+
+static void context_mark_guilty(struct i915_gem_context *ctx)
+{
+	unsigned int score;
+	bool banned, bannable;
+
+	atomic_inc(&ctx->guilty_count);
+
+	bannable = i915_gem_context_is_bannable(ctx);
+	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
+	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+
+	/* Cool contexts don't accumulate client ban score */
+	if (!bannable)
+		return;
+
+	if (banned) {
+		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
+				 ctx->name, atomic_read(&ctx->guilty_count),
+				 score);
+		i915_gem_context_set_banned(ctx);
+	}
+
+	if (!IS_ERR_OR_NULL(ctx->file_priv))
+		client_mark_guilty(ctx->file_priv, ctx);
+}
+
+static void context_mark_innocent(struct i915_gem_context *ctx)
+{
+	atomic_inc(&ctx->active_count);
+}
+
+static void gen3_stop_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	const u32 base = engine->mmio_base;
+
+	if (intel_engine_stop_cs(engine))
+		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
+
+	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
+	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
+
+	I915_WRITE_FW(RING_HEAD(base), 0);
+	I915_WRITE_FW(RING_TAIL(base), 0);
+	POSTING_READ_FW(RING_TAIL(base));
+
+	/* The ring must be empty before it is disabled */
+	I915_WRITE_FW(RING_CTL(base), 0);
+
+	/* Check acts as a post */
+	if (I915_READ_FW(RING_HEAD(base)) != 0)
+		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
+				 engine->name);
+}
+
+static void i915_stop_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(i915) < 3)
+		return;
+
+	for_each_engine_masked(engine, i915, engine_mask, id)
+		gen3_stop_engine(engine);
+}
+
+static bool i915_in_reset(struct pci_dev *pdev)
+{
+	u8 gdrst;
+
+	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
+	return gdrst & GRDOM_RESET_STATUS;
+}
+
+static int i915_do_reset(struct drm_i915_private *i915,
+			 unsigned int engine_mask,
+			 unsigned int retry)
+{
+	struct pci_dev *pdev = i915->drm.pdev;
+	int err;
+
+	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
+	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
+	usleep_range(50, 200);
+	err = wait_for(i915_in_reset(pdev), 500);
+
+	/* Clear the reset request. */
+	pci_write_config_byte(pdev, I915_GDRST, 0);
+	usleep_range(50, 200);
+	if (!err)
+		err = wait_for(!i915_in_reset(pdev), 500);
+
+	return err;
+}
+
+static bool g4x_reset_complete(struct pci_dev *pdev)
+{
+	u8 gdrst;
+
+	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
+	return (gdrst & GRDOM_RESET_ENABLE) == 0;
+}
+
+static int g33_do_reset(struct drm_i915_private *i915,
+			unsigned int engine_mask,
+			unsigned int retry)
+{
+	struct pci_dev *pdev = i915->drm.pdev;
+
+	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
+	return wait_for(g4x_reset_complete(pdev), 500);
+}
+
+static int g4x_do_reset(struct drm_i915_private *dev_priv,
+			unsigned int engine_mask,
+			unsigned int retry)
+{
+	struct pci_dev *pdev = dev_priv->drm.pdev;
+	int ret;
+
+	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
+	I915_WRITE(VDECCLK_GATE_D,
+		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ(VDECCLK_GATE_D);
+
+	pci_write_config_byte(pdev, I915_GDRST,
+			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
+	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
+		goto out;
+	}
+
+	pci_write_config_byte(pdev, I915_GDRST,
+			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
+	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
+		goto out;
+	}
+
+out:
+	pci_write_config_byte(pdev, I915_GDRST, 0);
+
+	I915_WRITE(VDECCLK_GATE_D,
+		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ(VDECCLK_GATE_D);
+
+	return ret;
+}
+
+static int ironlake_do_reset(struct drm_i915_private *dev_priv,
+			     unsigned int engine_mask,
+			     unsigned int retry)
+{
+	int ret;
+
+	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
+	ret = intel_wait_for_register(dev_priv,
+				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
+				      500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
+		goto out;
+	}
+
+	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
+	ret = intel_wait_for_register(dev_priv,
+				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
+				      500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
+		goto out;
+	}
+
+out:
+	I915_WRITE(ILK_GDSR, 0);
+	POSTING_READ(ILK_GDSR);
+	return ret;
+}
+
+/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
+static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
+				u32 hw_domain_mask)
+{
+	int err;
+
+	/*
+	 * GEN6_GDRST is not in the gt power well, no need to check
+	 * for fifo space for the write or forcewake the chip for
+	 * the read
+	 */
+	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);
+
+	/* Wait for the device to ack the reset requests */
+	err = __intel_wait_for_register_fw(dev_priv,
+					   GEN6_GDRST, hw_domain_mask, 0,
+					   500, 0,
+					   NULL);
+	if (err)
+		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
+				 hw_domain_mask);
+
+	return err;
+}
+
+static int gen6_reset_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask,
+			      unsigned int retry)
+{
+	struct intel_engine_cs *engine;
+	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
+		[RCS] = GEN6_GRDOM_RENDER,
+		[BCS] = GEN6_GRDOM_BLT,
+		[VCS] = GEN6_GRDOM_MEDIA,
+		[VCS2] = GEN8_GRDOM_MEDIA2,
+		[VECS] = GEN6_GRDOM_VECS,
+	};
+	u32 hw_mask;
+
+	if (engine_mask == ALL_ENGINES) {
+		hw_mask = GEN6_GRDOM_FULL;
+	} else {
+		unsigned int tmp;
+
+		hw_mask = 0;
+		for_each_engine_masked(engine, i915, engine_mask, tmp)
+			hw_mask |= hw_engine_mask[engine->id];
+	}
+
+	return gen6_hw_domain_reset(i915, hw_mask);
+}
+
+static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
+			  struct intel_engine_cs *engine)
+{
+	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
+	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
+	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
+	i915_reg_t sfc_usage;
+	u32 sfc_usage_bit;
+	u32 sfc_reset_bit;
+
+	switch (engine->class) {
+	case VIDEO_DECODE_CLASS:
+		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
+			return 0;
+
+		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
+		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
+
+		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
+		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
+
+		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
+		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
+		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
+		break;
+
+	case VIDEO_ENHANCEMENT_CLASS:
+		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
+		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
+
+		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
+		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
+
+		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
+		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
+		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
+		break;
+
+	default:
+		return 0;
+	}
+
+	/*
+	 * Tell the engine that a software reset is going to happen. The engine
+	 * will then try to force lock the SFC (if currently locked, it will
+	 * remain so until we tell the engine it is safe to unlock; if currently
+	 * unlocked, it will ignore this and all new lock requests). If SFC
+	 * ends up being locked to the engine we want to reset, we have to reset
+	 * it as well (we will unlock it once the reset sequence is completed).
+	 */
+	I915_WRITE_FW(sfc_forced_lock,
+		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);
+
+	if (__intel_wait_for_register_fw(dev_priv,
+					 sfc_forced_lock_ack,
+					 sfc_forced_lock_ack_bit,
+					 sfc_forced_lock_ack_bit,
+					 1000, 0, NULL)) {
+		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
+		return 0;
+	}
+
+	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
+		return sfc_reset_bit;
+
+	return 0;
+}
+
+static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
+			     struct intel_engine_cs *engine)
+{
+	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
+	i915_reg_t sfc_forced_lock;
+	u32 sfc_forced_lock_bit;
+
+	switch (engine->class) {
+	case VIDEO_DECODE_CLASS:
+		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
+			return;
+
+		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
+		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
+		break;
+
+	case VIDEO_ENHANCEMENT_CLASS:
+		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
+		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
+		break;
+
+	default:
+		return;
+	}
+
+	I915_WRITE_FW(sfc_forced_lock,
+		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
+}
+
+static int gen11_reset_engines(struct drm_i915_private *i915,
+			       unsigned int engine_mask,
+			       unsigned int retry)
+{
+	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
+		[RCS] = GEN11_GRDOM_RENDER,
+		[BCS] = GEN11_GRDOM_BLT,
+		[VCS] = GEN11_GRDOM_MEDIA,
+		[VCS2] = GEN11_GRDOM_MEDIA2,
+		[VCS3] = GEN11_GRDOM_MEDIA3,
+		[VCS4] = GEN11_GRDOM_MEDIA4,
+		[VECS] = GEN11_GRDOM_VECS,
+		[VECS2] = GEN11_GRDOM_VECS2,
+	};
+	struct intel_engine_cs *engine;
+	unsigned int tmp;
+	u32 hw_mask;
+	int ret;
+
+	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
+
+	if (engine_mask == ALL_ENGINES) {
+		hw_mask = GEN11_GRDOM_FULL;
+	} else {
+		hw_mask = 0;
+		for_each_engine_masked(engine, i915, engine_mask, tmp) {
+			hw_mask |= hw_engine_mask[engine->id];
+			hw_mask |= gen11_lock_sfc(i915, engine);
+		}
+	}
+
+	ret = gen6_hw_domain_reset(i915, hw_mask);
+
+	if (engine_mask != ALL_ENGINES)
+		for_each_engine_masked(engine, i915, engine_mask, tmp)
+			gen11_unlock_sfc(i915, engine);
+
+	return ret;
+}
+
+static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	int ret;
+
+	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
+		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
+
+	ret = __intel_wait_for_register_fw(dev_priv,
+					   RING_RESET_CTL(engine->mmio_base),
+					   RESET_CTL_READY_TO_RESET,
+					   RESET_CTL_READY_TO_RESET,
+					   700, 0,
+					   NULL);
+	if (ret)
+		DRM_ERROR("%s: reset request timeout\n", engine->name);
+
+	return ret;
+}
+
+static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
+		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
+}
+
+static int gen8_reset_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask,
+			      unsigned int retry)
+{
+	struct intel_engine_cs *engine;
+	const bool reset_non_ready = retry >= 1;
+	unsigned int tmp;
+	int ret;
+
+	for_each_engine_masked(engine, i915, engine_mask, tmp) {
+		ret = gen8_engine_reset_prepare(engine);
+		if (ret && !reset_non_ready)
+			goto skip_reset;
+
+		/*
+		 * If this is not the first failed attempt to prepare,
+		 * we decide to proceed anyway.
+		 *
+		 * By doing so we risk context corruption and with
+		 * some gens (kbl), possible system hang if reset
+		 * happens during active bb execution.
+		 *
+		 * We would rather take context corruption instead of a
+		 * failed reset with a wedged driver/gpu. And the
+		 * active bb execution case should be covered by the
+		 * i915_stop_engines we have before the reset.
+		 */
+	}
+
+	if (INTEL_GEN(i915) >= 11)
+		ret = gen11_reset_engines(i915, engine_mask, retry);
+	else
+		ret = gen6_reset_engines(i915, engine_mask, retry);
+
+skip_reset:
+	for_each_engine_masked(engine, i915, engine_mask, tmp)
+		gen8_engine_reset_cancel(engine);
+
+	return ret;
+}
+
+typedef int (*reset_func)(struct drm_i915_private *,
+			  unsigned int engine_mask,
+			  unsigned int retry);
+
+static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
+{
+	if (!i915_modparams.reset)
+		return NULL;
+
+	if (INTEL_GEN(i915) >= 8)
+		return gen8_reset_engines;
+	else if (INTEL_GEN(i915) >= 6)
+		return gen6_reset_engines;
+	else if (INTEL_GEN(i915) >= 5)
+		return ironlake_do_reset;
+	else if (IS_G4X(i915))
+		return g4x_do_reset;
+	else if (IS_G33(i915) || IS_PINEVIEW(i915))
+		return g33_do_reset;
+	else if (INTEL_GEN(i915) >= 3)
+		return i915_do_reset;
+	else
+		return NULL;
+}
+
+int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
+{
+	reset_func reset = intel_get_gpu_reset(i915);
+	int retry;
+	int ret;
+
+	/*
+	 * We want to perform per-engine reset from atomic context (e.g.
+	 * softirq), which imposes the constraint that we cannot sleep.
+	 * However, experience suggests that spending a bit of time waiting
+	 * for a reset helps in various cases, so for a full-device reset
+	 * we apply the opposite rule and wait if we want to. As we should
+	 * always follow up a failed per-engine reset with a full device reset,
+	 * being a little faster, stricter and more error prone for the
+	 * atomic case seems an acceptable compromise.
+	 *
+	 * Unfortunately this leads to a bimodal routine, when the goal was
+	 * to have a single reset function that worked for resetting any
+	 * number of engines simultaneously.
+	 */
+	might_sleep_if(engine_mask == ALL_ENGINES);
+
+	/*
+	 * If the power well sleeps during the reset, the reset
+	 * request may be dropped and never completes (causing -EIO).
+	 */
+	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
+	for (retry = 0; retry < 3; retry++) {
+		/*
+		 * We stop engines, otherwise we might get failed reset and a
+		 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
+		 * from system hang if batchbuffer is progressing when
+		 * the reset is issued, regardless of READY_TO_RESET ack.
+		 * Thus assume it is best to stop engines on all gens
+		 * where we have a gpu reset.
+		 *
+		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
+		 *
+		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
+		 *
+		 * FIXME: Wa for more modern gens needs to be validated
+		 */
+		i915_stop_engines(i915, engine_mask);
+
+		ret = -ENODEV;
+		if (reset) {
+			GEM_TRACE("engine_mask=%x\n", engine_mask);
+			ret = reset(i915, engine_mask, retry);
+		}
+		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
+			break;
+
+		cond_resched();
+	}
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+
+	return ret;
+}
+
+bool intel_has_gpu_reset(struct drm_i915_private *i915)
+{
+	return intel_get_gpu_reset(i915);
+}
+
+bool intel_has_reset_engine(struct drm_i915_private *i915)
+{
+	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
+}
+
+int intel_reset_guc(struct drm_i915_private *i915)
+{
+	u32 guc_domain =
+		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
+	int ret;
+
+	GEM_BUG_ON(!HAS_GUC(i915));
+
+	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
+	ret = gen6_hw_domain_reset(i915, guc_domain);
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+
+	return ret;
+}
+
+/*
+ * Ensure the irq handler finishes, and is not run again.
+ * Also return the active request so that we only search for it once.
+ */
+static struct i915_request *
+reset_prepare_engine(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	/*
+	 * During the reset sequence, we must prevent the engine from
+	 * entering RC6. As the context state is undefined until we restart
+	 * the engine, if it does enter RC6 during the reset, the state
+	 * written to the powercontext is undefined and so we may lose
+	 * GPU state upon resume, i.e. fail to restart after a reset.
+	 */
+	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
+
+	rq = engine->reset.prepare(engine);
+	if (rq && rq->fence.error == -EIO)
+		rq = ERR_PTR(-EIO); /* Previous reset failed! */
+
+	return rq;
+}
+
+static int reset_prepare(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	struct i915_request *rq;
+	enum intel_engine_id id;
+	int err = 0;
+
+	for_each_engine(engine, i915, id) {
+		rq = reset_prepare_engine(engine);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			continue;
+		}
+
+		engine->hangcheck.active_request = rq;
+	}
+
+	i915_gem_revoke_fences(i915);
+	intel_uc_sanitize(i915);
+
+	return err;
+}
+
+/* Returns the request if it was guilty of the hang */
+static struct i915_request *
+reset_request(struct intel_engine_cs *engine,
+	      struct i915_request *rq,
+	      bool stalled)
+{
+	/*
+	 * The guilty request will get skipped on a hung engine.
+	 *
+	 * Users of client default contexts do not rely on logical
+	 * state preserved between batches so it is safe to execute
+	 * queued requests following the hang. Non default contexts
+	 * rely on preserved state, so skipping a batch loses the
+	 * evolution of the state and it needs to be considered corrupted.
+	 * Executing more queued batches on top of corrupted state is
+	 * risky. But we take the risk by trying to advance through
+	 * the queued requests in order to make the client behaviour
+	 * more predictable around resets, by not throwing away a random
+	 * number of batches it has prepared for execution. Sophisticated
+	 * clients can use gem_reset_stats_ioctl and dma fence status
+	 * (exported via sync_file info ioctl on explicit fences) to observe
+	 * when they lose the context state and should rebuild accordingly.
+	 *
+	 * The context ban, and ultimately the client ban, mechanism are safety
+	 * valves if client submission ends up resulting in nothing more than
+	 * subsequent hangs.
+	 */
+
+	if (i915_request_completed(rq)) {
+		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
+			  engine->name, rq->global_seqno,
+			  rq->fence.context, rq->fence.seqno,
+			  intel_engine_get_seqno(engine));
+		stalled = false;
+	}
+
+	if (stalled) {
+		context_mark_guilty(rq->gem_context);
+		i915_request_skip(rq, -EIO);
+
+		/* If this context is now banned, skip all pending requests. */
+		if (i915_gem_context_is_banned(rq->gem_context))
+			engine_skip_context(rq);
+	} else {
+		/*
+		 * Since this is not the hung engine, it may have advanced
+		 * since the hang declaration. Double check by refinding
+		 * the active request at the time of the reset.
+		 */
+		rq = i915_gem_find_active_request(engine);
+		if (rq) {
+			unsigned long flags;
+
+			context_mark_innocent(rq->gem_context);
+			dma_fence_set_error(&rq->fence, -EAGAIN);
+
+			/* Rewind the engine to replay the incomplete rq */
+			spin_lock_irqsave(&engine->timeline.lock, flags);
+			rq = list_prev_entry(rq, link);
+			if (&rq->link == &engine->timeline.requests)
+				rq = NULL;
+			spin_unlock_irqrestore(&engine->timeline.lock, flags);
+		}
+	}
+
+	return rq;
+}
+
+static void reset_engine(struct intel_engine_cs *engine,
+			 struct i915_request *rq,
+			 bool stalled)
+{
+	if (rq)
+		rq = reset_request(engine, rq, stalled);
+
+	/* Setup the CS to resume from the breadcrumb of the hung request */
+	engine->reset.reset(engine, rq);
+}
+
+static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	i915_retire_requests(i915);
+
+	for_each_engine(engine, i915, id) {
+		struct intel_context *ce;
+
+		reset_engine(engine,
+			     engine->hangcheck.active_request,
+			     stalled_mask & ENGINE_MASK(id));
+		ce = fetch_and_zero(&engine->last_retired_context);
+		if (ce)
+			intel_context_unpin(ce);
+
+		/*
+		 * Ostensibly, we always want a context loaded for powersaving,
+		 * so if the engine is idle after the reset, send a request
+		 * to load our scratch kernel_context.
+		 *
+		 * More mysteriously, if we leave the engine idle after a reset,
+		 * the next userspace batch may hang, with what appears to be
+		 * an incoherent read by the CS (presumably stale TLB). An
+		 * empty request appears sufficient to paper over the glitch.
+		 */
+		if (intel_engine_is_idle(engine)) {
+			struct i915_request *rq;
+
+			rq = i915_request_alloc(engine, i915->kernel_context);
+			if (!IS_ERR(rq))
+				i915_request_add(rq);
+		}
+	}
+
+	i915_gem_restore_fences(i915);
+}
+
+static void reset_finish_engine(struct intel_engine_cs *engine)
+{
+	engine->reset.finish(engine);
+
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+}
+
+static void reset_finish(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	for_each_engine(engine, i915, id) {
+		engine->hangcheck.active_request = NULL;
+		reset_finish_engine(engine);
+	}
+}
+
+static void nop_submit_request(struct i915_request *request)
+{
+	unsigned long flags;
+
+	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
+		  request->engine->name,
+		  request->fence.context, request->fence.seqno);
+	dma_fence_set_error(&request->fence, -EIO);
+
+	spin_lock_irqsave(&request->engine->timeline.lock, flags);
+	__i915_request_submit(request);
+	intel_engine_write_global_seqno(request->engine, request->global_seqno);
+	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
+}
+
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	mutex_lock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags)) {
+		mutex_unlock(&error->wedge_mutex);
+		return;
+	}
+
+	if (GEM_SHOW_DEBUG()) {
+		struct drm_printer p = drm_debug_printer(__func__);
+
+		for_each_engine(engine, i915, id)
+			intel_engine_dump(engine, &p, "%s\n", engine->name);
+	}
+
+	GEM_TRACE("start\n");
+
+	/*
+	 * First, stop submission to hw, but do not yet complete requests by
+	 * rolling the global seqno forward (since this would complete requests
+	 * for which we haven't set the fence error to EIO yet).
+	 */
+	for_each_engine(engine, i915, id)
+		reset_prepare_engine(engine);
+
+	/* Even if the GPU reset fails, it should still stop the engines */
+	if (INTEL_GEN(i915) >= 5)
+		intel_gpu_reset(i915, ALL_ENGINES);
+
+	for_each_engine(engine, i915, id) {
+		engine->submit_request = nop_submit_request;
+		engine->schedule = NULL;
+	}
+	i915->caps.scheduler = 0;
+
+	/*
+	 * Make sure no request can slip through without getting completed by
+	 * either this call here to intel_engine_write_global_seqno, or the one
+	 * in nop_submit_request.
+	 */
+	synchronize_rcu();
+
+	/* Mark all executing requests as skipped */
+	for_each_engine(engine, i915, id)
+		engine->cancel_requests(engine);
+
+	for_each_engine(engine, i915, id) {
+		reset_finish_engine(engine);
+		intel_engine_wakeup(engine);
+	}
+
+	smp_mb__before_atomic();
+	set_bit(I915_WEDGED, &error->flags);
+
+	GEM_TRACE("end\n");
+	mutex_unlock(&error->wedge_mutex);
+
+	wake_up_all(&error->reset_queue);
+}
+
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	struct i915_timeline *tl;
+	bool ret = false;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	if (!test_bit(I915_WEDGED, &error->flags))
+		return true;
+
+	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
+		return false;
+
+	mutex_lock(&error->wedge_mutex);
+
+	GEM_TRACE("start\n");
+
+	/*
+	 * Before unwedging, make sure that all pending operations
+	 * are flushed and errored out - we may have requests waiting upon
+	 * third party fences. We marked all inflight requests as EIO, and
+	 * every execbuf since has returned EIO; for consistency we want all
+	 * the currently pending requests to also be marked as EIO, which
+	 * is done inside our nop_submit_request - and so we must wait.
+	 *
+	 * No more can be submitted until we reset the wedged bit.
+	 */
+	list_for_each_entry(tl, &i915->gt.timelines, link) {
+		struct i915_request *rq;
+
+		rq = i915_gem_active_peek(&tl->last_request,
+					  &i915->drm.struct_mutex);
+		if (!rq)
+			continue;
+
+		/*
+		 * We can't use our normal waiter as we want to
+		 * avoid recursively trying to handle the current
+		 * reset. The basic dma_fence_default_wait() installs
+		 * a callback for dma_fence_signal(), which is
+		 * triggered by our nop handler (indirectly, the
+		 * callback enables the signaler thread which is
+		 * woken by the nop_submit_request() advancing the seqno
+		 * and when the seqno passes the fence, the signaler
+		 * then signals the fence waking us up).
+		 */
+		if (dma_fence_default_wait(&rq->fence, true,
+					   MAX_SCHEDULE_TIMEOUT) < 0)
+			goto unlock;
+	}
+	i915_retire_requests(i915);
+	GEM_BUG_ON(i915->gt.active_requests);
+
+	intel_engines_sanitize(i915, false);
+
+	/*
+	 * Undo nop_submit_request. We prevent all new i915 requests from
+	 * being queued (by disallowing execbuf whilst wedged) so having
+	 * waited for all active requests above, we know the system is idle
+	 * and do not have to worry about a thread being inside
+	 * engine->submit_request() as we swap over. So unlike installing
+	 * the nop_submit_request on reset, we can do this from normal
+	 * context and do not require stop_machine().
+	 */
+	intel_engines_reset_default_submission(i915);
+	i915_gem_contexts_lost(i915);
+
+	GEM_TRACE("end\n");
+
+	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
+	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
+	ret = true;
+unlock:
+	mutex_unlock(&i915->gpu_error.wedge_mutex);
+
+	return ret;
+}
+
+/**
+ * i915_reset - reset chip after a hang
+ * @i915: #drm_i915_private to reset
+ * @stalled_mask: mask of the stalled engines with the guilty requests
+ * @reason: user error message for why we are resetting
+ *
+ * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
+ * on failure.
+ *
+ * Caller must hold the struct_mutex.
+ *
+ * Procedure is fairly simple:
+ *   - reset the chip using the reset reg
+ *   - re-init context state
+ *   - re-init hardware status page
+ *   - re-init ring buffer
+ *   - re-init interrupt state
+ *   - re-init display
+ */
+void i915_reset(struct drm_i915_private *i915,
+		unsigned int stalled_mask,
+		const char *reason)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	int ret;
+	int i;
+
+	GEM_TRACE("flags=%lx\n", error->flags);
+
+	might_sleep();
+	lockdep_assert_held(&i915->drm.struct_mutex);
+	assert_rpm_wakelock_held(i915);
+	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
+		return;
+
+	/* Clear any previous failed attempts at recovery. Time to try again. */
+	if (!i915_gem_unset_wedged(i915))
+		goto wakeup;
+
+	if (reason)
+		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
+	error->reset_count++;
+
+	ret = reset_prepare(i915);
+	if (ret) {
+		dev_err(i915->drm.dev, "GPU recovery failed\n");
+		goto taint;
+	}
+
+	if (!intel_has_gpu_reset(i915)) {
+		if (i915_modparams.reset)
+			dev_err(i915->drm.dev, "GPU reset not supported\n");
+		else
+			DRM_DEBUG_DRIVER("GPU reset disabled\n");
+		goto error;
+	}
+
+	for (i = 0; i < 3; i++) {
+		ret = intel_gpu_reset(i915, ALL_ENGINES);
+		if (ret == 0)
+			break;
+
+		msleep(100);
+	}
+	if (ret) {
+		dev_err(i915->drm.dev, "Failed to reset chip\n");
+		goto taint;
+	}
+
+	/* Ok, now get things going again... */
+
+	/*
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.
+	 */
+	ret = i915_ggtt_enable_hw(i915);
+	if (ret) {
+		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
+			  ret);
+		goto error;
+	}
+
+	gt_reset(i915, stalled_mask);
+	intel_overlay_reset(i915);
+
+	/*
+	 * Next we need to restore the context, but we don't use those
+	 * yet either...
+	 *
+	 * Ring buffer needs to be re-initialized in the KMS case, or if X
+	 * was running at the time of the reset (i.e. we weren't VT
+	 * switched away).
+	 */
+	ret = i915_gem_init_hw(i915);
+	if (ret) {
+		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
+			  ret);
+		goto error;
+	}
+
+	i915_queue_hangcheck(i915);
+
+finish:
+	reset_finish(i915);
+wakeup:
+	clear_bit(I915_RESET_HANDOFF, &error->flags);
+	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+	return;
+
+taint:
+	/*
+	 * History tells us that if we cannot reset the GPU now, we
+	 * never will. This then impacts everything that is run
+	 * subsequently. On failing the reset, we mark the driver
+	 * as wedged, preventing further execution on the GPU.
+	 * We also want to go one step further and add a taint to the
+	 * kernel so that any subsequent faults can be traced back to
+	 * this failure. This is important for CI, where if the
+	 * GPU/driver fails we would like to reboot and restart testing
+	 * rather than continue on into oblivion. For everyone else,
+	 * the system should still plod along, but they have been warned!
+	 */
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+error:
+	i915_gem_set_wedged(i915);
+	i915_retire_requests(i915);
+	goto finish;
+}
+
+static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
+					struct intel_engine_cs *engine)
+{
+	return intel_gpu_reset(i915, intel_engine_flag(engine));
+}
+
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ * @msg: reason for GPU reset; or NULL for no dev_notice()
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identifies the request that caused the hang and it is dropped
+ *  - reset engine (which will force the engine to idle)
+ *  - re-init/configure engine
+ */
+int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
+{
+	struct i915_gpu_error *error = &engine->i915->gpu_error;
+	struct i915_request *active_request;
+	int ret;
+
+	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
+	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
+
+	active_request = reset_prepare_engine(engine);
+	if (IS_ERR_OR_NULL(active_request)) {
+		/* Either the previous reset failed, or we pardon the reset. */
+		ret = PTR_ERR(active_request);
+		goto out;
+	}
+
+	if (msg)
+		dev_notice(engine->i915->drm.dev,
+			   "Resetting %s for %s\n", engine->name, msg);
+	error->reset_engine_count[engine->id]++;
+
+	if (!engine->i915->guc.execbuf_client)
+		ret = intel_gt_reset_engine(engine->i915, engine);
+	else
+		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
+	if (ret) {
+		/* If we fail here, we expect to fallback to a global reset */
+		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
+				 engine->i915->guc.execbuf_client ? "GuC " : "",
+				 engine->name, ret);
+		goto out;
+	}
+
+	/*
+	 * The request that caused the hang is stuck on elsp; we know the
+	 * active request and can drop it, adjusting head to skip the offending
+	 * request so that we resume executing the remaining requests in the queue.
+	 */
+	reset_engine(engine, active_request, true);
+
+	/*
+	 * The engine and its registers (and workarounds in case of render)
+	 * have been reset to their default values. Follow the init_ring
+	 * process to program RING_MODE, HWSP and re-enable submission.
+	 */
+	ret = engine->init_hw(engine);
+	if (ret)
+		goto out;
+
+out:
+	intel_engine_cancel_stop_cs(engine);
+	reset_finish_engine(engine);
+	return ret;
+}
+
+static void i915_reset_device(struct drm_i915_private *i915,
+			      u32 engine_mask,
+			      const char *reason)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
+	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
+	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
+	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct i915_wedge_me w;
+
+	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
+
+	DRM_DEBUG_DRIVER("resetting chip\n");
+	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
+
+	/* Use a watchdog to ensure that our reset completes */
+	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
+		intel_prepare_reset(i915);
+
+		error->reason = reason;
+		error->stalled_mask = engine_mask;
+
+		/* Signal that locked waiters should reset the GPU */
+		smp_mb__before_atomic();
+		set_bit(I915_RESET_HANDOFF, &error->flags);
+		wake_up_all(&error->wait_queue);
+
+		/*
+		 * Wait for anyone holding the lock to wake up, without
+		 * blocking indefinitely on struct_mutex.
+		 */
+		do {
+			if (mutex_trylock(&i915->drm.struct_mutex)) {
+				i915_reset(i915, engine_mask, reason);
+				mutex_unlock(&i915->drm.struct_mutex);
+			}
+		} while (wait_on_bit_timeout(&error->flags,
+					     I915_RESET_HANDOFF,
+					     TASK_UNINTERRUPTIBLE,
+					     1));
+
+		error->stalled_mask = 0;
+		error->reason = NULL;
+
+		intel_finish_reset(i915);
+	}
+
+	if (!test_bit(I915_WEDGED, &error->flags))
+		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
+}
+
+void i915_clear_error_registers(struct drm_i915_private *dev_priv)
+{
+	u32 eir;
+
+	if (!IS_GEN(dev_priv, 2))
+		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
+
+	if (INTEL_GEN(dev_priv) < 4)
+		I915_WRITE(IPEIR, I915_READ(IPEIR));
+	else
+		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
+
+	I915_WRITE(EIR, I915_READ(EIR));
+	eir = I915_READ(EIR);
+	if (eir) {
+		/*
+		 * some errors might have become stuck,
+		 * mask them.
+		 */
+		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
+		I915_WRITE(EMR, I915_READ(EMR) | eir);
+		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
+	}
+
+	if (INTEL_GEN(dev_priv) >= 8) {
+		I915_WRITE(GEN8_RING_FAULT_REG,
+			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
+		POSTING_READ(GEN8_RING_FAULT_REG);
+	} else if (INTEL_GEN(dev_priv) >= 6) {
+		struct intel_engine_cs *engine;
+		enum intel_engine_id id;
+
+		for_each_engine(engine, dev_priv, id) {
+			I915_WRITE(RING_FAULT_REG(engine),
+				   I915_READ(RING_FAULT_REG(engine)) &
+				   ~RING_FAULT_VALID);
+		}
+		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
+	}
+}
+
+/**
+ * i915_handle_error - handle a gpu error
+ * @i915: i915 device private
+ * @engine_mask: mask representing engines that are hung
+ * @flags: control flags
+ * @fmt: Error message format string
+ *
+ * Do some basic checking of register state at error time and
+ * dump it to the syslog.  Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs.  Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+void i915_handle_error(struct drm_i915_private *i915,
+		       u32 engine_mask,
+		       unsigned long flags,
+		       const char *fmt, ...)
+{
+	struct intel_engine_cs *engine;
+	intel_wakeref_t wakeref;
+	unsigned int tmp;
+	char error_msg[80];
+	char *msg = NULL;
+
+	if (fmt) {
+		va_list args;
+
+		va_start(args, fmt);
+		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+		va_end(args);
+
+		msg = error_msg;
+	}
+
+	/*
+	 * In most cases it's guaranteed that we get here with an RPM
+	 * reference held, for example because there is a pending GPU
+	 * request that won't finish until the reset is done. This
+	 * isn't the case at least when we get here by doing a
+	 * simulated reset via debugfs, so get an RPM reference.
+	 */
+	wakeref = intel_runtime_pm_get(i915);
+
+	engine_mask &= INTEL_INFO(i915)->ring_mask;
+
+	if (flags & I915_ERROR_CAPTURE) {
+		i915_capture_error_state(i915, engine_mask, msg);
+		i915_clear_error_registers(i915);
+	}
+
+	/*
+	 * Try engine reset when available. We fall back to full reset if
+	 * single reset fails.
+	 */
+	if (intel_has_reset_engine(i915) &&
+	    !i915_terminally_wedged(&i915->gpu_error)) {
+		for_each_engine_masked(engine, i915, engine_mask, tmp) {
+			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
+			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+					     &i915->gpu_error.flags))
+				continue;
+
+			if (i915_reset_engine(engine, msg) == 0)
+				engine_mask &= ~intel_engine_flag(engine);
+
+			clear_bit(I915_RESET_ENGINE + engine->id,
+				  &i915->gpu_error.flags);
+			wake_up_bit(&i915->gpu_error.flags,
+				    I915_RESET_ENGINE + engine->id);
+		}
+	}
+
+	if (!engine_mask)
+		goto out;
+
+	/* Full reset needs the mutex, so stop any other user trying to do so. */
+	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
+		wait_event(i915->gpu_error.reset_queue,
+			   !test_bit(I915_RESET_BACKOFF,
+				     &i915->gpu_error.flags));
+		goto out;
+	}
+
+	/* Prevent any other reset-engine attempt. */
+	for_each_engine(engine, i915, tmp) {
+		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+					&i915->gpu_error.flags))
+			wait_on_bit(&i915->gpu_error.flags,
+				    I915_RESET_ENGINE + engine->id,
+				    TASK_UNINTERRUPTIBLE);
+	}
+
+	i915_reset_device(i915, engine_mask, msg);
+
+	for_each_engine(engine, i915, tmp) {
+		clear_bit(I915_RESET_ENGINE + engine->id,
+			  &i915->gpu_error.flags);
+	}
+
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+	wake_up_all(&i915->gpu_error.reset_queue);
+
+out:
+	intel_runtime_pm_put(i915, wakeref);
+}
+
+static void i915_wedge_me(struct work_struct *work)
+{
+	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	dev_err(w->i915->drm.dev,
+		"%s timed out, cancelling all in-flight rendering.\n",
+		w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+void __i915_init_wedge(struct i915_wedge_me *w,
+		       struct drm_i915_private *i915,
+		       long timeout,
+		       const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+void __i915_fini_wedge(struct i915_wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
new file mode 100644
index 000000000000..b6a519bde67d
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -0,0 +1,56 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2008-2018 Intel Corporation
+ */
+
+#ifndef I915_RESET_H
+#define I915_RESET_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct drm_i915_private;
+struct intel_engine_cs;
+struct intel_guc;
+
+__printf(4, 5)
+void i915_handle_error(struct drm_i915_private *i915,
+		       u32 engine_mask,
+		       unsigned long flags,
+		       const char *fmt, ...);
+#define I915_ERROR_CAPTURE BIT(0)
+
+void i915_clear_error_registers(struct drm_i915_private *i915);
+
+void i915_reset(struct drm_i915_private *i915,
+		unsigned int stalled_mask,
+		const char *reason);
+int i915_reset_engine(struct intel_engine_cs *engine,
+		      const char *reason);
+
+bool intel_has_gpu_reset(struct drm_i915_private *i915);
+bool intel_has_reset_engine(struct drm_i915_private *i915);
+
+int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);
+
+int intel_reset_guc(struct drm_i915_private *i915);
+
+struct i915_wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+void __i915_init_wedge(struct i915_wedge_me *w,
+		       struct drm_i915_private *i915,
+		       long timeout,
+		       const char *name);
+void __i915_fini_wedge(struct i915_wedge_me *w);
+
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__);	\
+	     (W)->i915;							\
+	     __i915_fini_wedge((W)))
+
+#endif /* I915_RESET_H */
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 64dbd06f4ffb..fbe3c3a3b675 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -31,13 +31,7 @@
 #include <linux/slab.h>
 #include <linux/vgaarb.h>
 #include <drm/drm_edid.h>
-#include "intel_drv.h"
-#include "intel_frontbuffer.h"
 #include <drm/i915_drm.h>
-#include "i915_drv.h"
-#include "i915_gem_clflush.h"
-#include "intel_dsi.h"
-#include "i915_trace.h"
 #include <drm/drm_atomic.h>
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_dp_helper.h>
@@ -48,6 +42,15 @@
 #include <linux/intel-iommu.h>
 #include <linux/reservation.h>
 
+#include "intel_drv.h"
+#include "intel_dsi.h"
+#include "intel_frontbuffer.h"
+
+#include "i915_drv.h"
+#include "i915_gem_clflush.h"
+#include "i915_reset.h"
+#include "i915_trace.h"
+
 /* Primary plane formats for gen <= 3 */
 static const uint32_t i8xx_primary_formats[] = {
 	DRM_FORMAT_C8,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index bf4dae2649ab..9c943bb95cb9 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -25,6 +25,7 @@
 #include <drm/drm_print.h>
 
 #include "i915_drv.h"
+#include "i915_reset.h"
 #include "intel_ringbuffer.h"
 #include "intel_lrc.h"
 
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 0f1c4f9ebfd8..744220296653 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -192,4 +192,7 @@ static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask)
 	spin_unlock_irq(&guc->irq_lock);
 }
 
+int intel_guc_reset_engine(struct intel_guc *guc,
+			   struct intel_engine_cs *engine);
+
 #endif
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 51e9efec5116..7dc11fcb13de 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -23,6 +23,7 @@
  */
 
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
 {
diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
index 731b82afe636..e711eb3268bc 100644
--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -26,6 +26,7 @@
 #include "intel_guc_submission.h"
 #include "intel_guc.h"
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static void guc_free_load_err_log(struct intel_guc *guc);
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index 681ea532585e..e88f0252d77e 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1715,372 +1715,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	const u32 base = engine->mmio_base;
-
-	if (intel_engine_stop_cs(engine))
-		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
-
-	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
-	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
-
-	I915_WRITE_FW(RING_HEAD(base), 0);
-	I915_WRITE_FW(RING_TAIL(base), 0);
-	POSTING_READ_FW(RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	I915_WRITE_FW(RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
-}
-
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned int engine_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (INTEL_GEN(dev_priv) < 3)
-		return;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
-}
-
-static bool i915_in_reset(struct pci_dev *pdev)
-{
-	u8 gdrst;
-
-	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
-	return gdrst & GRDOM_RESET_STATUS;
-}
-
-static int i915_do_reset(struct drm_i915_private *dev_priv,
-			 unsigned int engine_mask,
-			 unsigned int retry)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-	int err;
-
-	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
-	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	usleep_range(50, 200);
-	err = wait_for(i915_in_reset(pdev), 500);
-
-	/* Clear the reset request. */
-	pci_write_config_byte(pdev, I915_GDRST, 0);
-	usleep_range(50, 200);
-	if (!err)
-		err = wait_for(!i915_in_reset(pdev), 500);
-
-	return err;
-}
-
-static bool g4x_reset_complete(struct pci_dev *pdev)
-{
-	u8 gdrst;
-
-	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
-	return (gdrst & GRDOM_RESET_ENABLE) == 0;
-}
-
-static int g33_do_reset(struct drm_i915_private *dev_priv,
-			unsigned int engine_mask,
-			unsigned int retry)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-
-	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	return wait_for(g4x_reset_complete(pdev), 500);
-}
-
-static int g4x_do_reset(struct drm_i915_private *dev_priv,
-			unsigned int engine_mask,
-			unsigned int retry)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-	int ret;
-
-	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
-
-	pci_write_config_byte(pdev, I915_GDRST,
-			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
-		goto out;
-	}
-
-	pci_write_config_byte(pdev, I915_GDRST,
-			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
-		goto out;
-	}
-
-out:
-	pci_write_config_byte(pdev, I915_GDRST, 0);
-
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
-
-	return ret;
-}
-
-static int ironlake_do_reset(struct drm_i915_private *dev_priv,
-			     unsigned int engine_mask,
-			     unsigned int retry)
-{
-	int ret;
-
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
-		goto out;
-	}
-
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
-		goto out;
-	}
-
-out:
-	I915_WRITE(ILK_GDSR, 0);
-	POSTING_READ(ILK_GDSR);
-	return ret;
-}
-
-/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
-static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
-				u32 hw_domain_mask)
-{
-	int err;
-
-	/* GEN6_GDRST is not in the gt power well, no need to check
-	 * for fifo space for the write or forcewake the chip for
-	 * the read
-	 */
-	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
-
-	/* Wait for the device to ack the reset requests */
-	err = __intel_wait_for_register_fw(dev_priv,
-					   GEN6_GDRST, hw_domain_mask, 0,
-					   500, 0,
-					   NULL);
-	if (err)
-		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
-				 hw_domain_mask);
-
-	return err;
-}
-
-/**
- * gen6_reset_engines - reset individual engines
- * @dev_priv: i915 device
- * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
- * @retry: the count of of previous attempts to reset.
- *
- * This function will reset the individual engines that are set in engine_mask.
- * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
- *
- * Note: It is responsibility of the caller to handle the difference between
- * asking full domain reset versus reset for all available individual engines.
- *
- * Returns 0 on success, nonzero on error.
- */
-static int gen6_reset_engines(struct drm_i915_private *dev_priv,
-			      unsigned int engine_mask,
-			      unsigned int retry)
-{
-	struct intel_engine_cs *engine;
-	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
-		[RCS] = GEN6_GRDOM_RENDER,
-		[BCS] = GEN6_GRDOM_BLT,
-		[VCS] = GEN6_GRDOM_MEDIA,
-		[VCS2] = GEN8_GRDOM_MEDIA2,
-		[VECS] = GEN6_GRDOM_VECS,
-	};
-	u32 hw_mask;
-
-	if (engine_mask == ALL_ENGINES) {
-		hw_mask = GEN6_GRDOM_FULL;
-	} else {
-		unsigned int tmp;
-
-		hw_mask = 0;
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-			hw_mask |= hw_engine_mask[engine->id];
-	}
-
-	return gen6_hw_domain_reset(dev_priv, hw_mask);
-}
-
-static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
-			  struct intel_engine_cs *engine)
-{
-	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
-	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
-	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
-	i915_reg_t sfc_usage;
-	u32 sfc_usage_bit;
-	u32 sfc_reset_bit;
-
-	switch (engine->class) {
-	case VIDEO_DECODE_CLASS:
-		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
-			return 0;
-
-		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
-		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
-
-		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
-		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
-
-		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
-		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
-		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
-		break;
-
-	case VIDEO_ENHANCEMENT_CLASS:
-		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
-		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
-
-		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
-		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
-
-		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
-		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
-		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
-		break;
-
-	default:
-		return 0;
-	}
-
-	/*
-	 * Tell the engine that a software reset is going to happen. The engine
-	 * will then try to force lock the SFC (if currently locked, it will
-	 * remain so until we tell the engine it is safe to unlock; if currently
-	 * unlocked, it will ignore this and all new lock requests). If SFC
-	 * ends up being locked to the engine we want to reset, we have to reset
-	 * it as well (we will unlock it once the reset sequence is completed).
-	 */
-	I915_WRITE_FW(sfc_forced_lock,
-		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);
-
-	if (__intel_wait_for_register_fw(dev_priv,
-					 sfc_forced_lock_ack,
-					 sfc_forced_lock_ack_bit,
-					 sfc_forced_lock_ack_bit,
-					 1000, 0, NULL)) {
-		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
-		return 0;
-	}
-
-	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
-		return sfc_reset_bit;
-
-	return 0;
-}
-
-static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
-			     struct intel_engine_cs *engine)
-{
-	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
-	i915_reg_t sfc_forced_lock;
-	u32 sfc_forced_lock_bit;
-
-	switch (engine->class) {
-	case VIDEO_DECODE_CLASS:
-		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
-			return;
-
-		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
-		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
-		break;
-
-	case VIDEO_ENHANCEMENT_CLASS:
-		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
-		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
-		break;
-
-	default:
-		return;
-	}
-
-	I915_WRITE_FW(sfc_forced_lock,
-		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
-}
-
-/**
- * gen11_reset_engines - reset individual engines
- * @dev_priv: i915 device
- * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
- *
- * This function will reset the individual engines that are set in engine_mask.
- * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
- *
- * Note: It is responsibility of the caller to handle the difference between
- * asking full domain reset versus reset for all available individual engines.
- *
- * Returns 0 on success, nonzero on error.
- */
-static int gen11_reset_engines(struct drm_i915_private *dev_priv,
-			       unsigned int engine_mask)
-{
-	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
-		[RCS] = GEN11_GRDOM_RENDER,
-		[BCS] = GEN11_GRDOM_BLT,
-		[VCS] = GEN11_GRDOM_MEDIA,
-		[VCS2] = GEN11_GRDOM_MEDIA2,
-		[VCS3] = GEN11_GRDOM_MEDIA3,
-		[VCS4] = GEN11_GRDOM_MEDIA4,
-		[VECS] = GEN11_GRDOM_VECS,
-		[VECS2] = GEN11_GRDOM_VECS2,
-	};
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
-	u32 hw_mask;
-	int ret;
-
-	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
-
-	if (engine_mask == ALL_ENGINES) {
-		hw_mask = GEN11_GRDOM_FULL;
-	} else {
-		hw_mask = 0;
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-			hw_mask |= hw_engine_mask[engine->id];
-			hw_mask |= gen11_lock_sfc(dev_priv, engine);
-		}
-	}
-
-	ret = gen6_hw_domain_reset(dev_priv, hw_mask);
-
-	if (engine_mask != ALL_ENGINES)
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-			gen11_unlock_sfc(dev_priv, engine);
-
-	return ret;
-}
-
 /**
  * __intel_wait_for_register_fw - wait until register matches expected state
  * @dev_priv: the i915 device
@@ -2191,196 +1825,6 @@ int __intel_wait_for_register(struct drm_i915_private *dev_priv,
 	return ret;
 }
 
-static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	int ret;
-
-	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
-		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
-
-	ret = __intel_wait_for_register_fw(dev_priv,
-					   RING_RESET_CTL(engine->mmio_base),
-					   RESET_CTL_READY_TO_RESET,
-					   RESET_CTL_READY_TO_RESET,
-					   700, 0,
-					   NULL);
-	if (ret)
-		DRM_ERROR("%s: reset request timeout\n", engine->name);
-
-	return ret;
-}
-
-static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
-		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
-}
-
-static int reset_engines(struct drm_i915_private *i915,
-			 unsigned int engine_mask,
-			 unsigned int retry)
-{
-	if (INTEL_GEN(i915) >= 11)
-		return gen11_reset_engines(i915, engine_mask);
-	else
-		return gen6_reset_engines(i915, engine_mask, retry);
-}
-
-static int gen8_reset_engines(struct drm_i915_private *dev_priv,
-			      unsigned int engine_mask,
-			      unsigned int retry)
-{
-	struct intel_engine_cs *engine;
-	const bool reset_non_ready = retry >= 1;
-	unsigned int tmp;
-	int ret;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-		ret = gen8_engine_reset_prepare(engine);
-		if (ret && !reset_non_ready)
-			goto skip_reset;
-
-		/*
-		 * If this is not the first failed attempt to prepare,
-		 * we decide to proceed anyway.
-		 *
-		 * By doing so we risk context corruption and with
-		 * some gens (kbl), possible system hang if reset
-		 * happens during active bb execution.
-		 *
-		 * We rather take context corruption instead of
-		 * failed reset with a wedged driver/gpu. And
-		 * active bb execution case should be covered by
-		 * i915_stop_engines we have before the reset.
-		 */
-	}
-
-	ret = reset_engines(dev_priv, engine_mask, retry);
-
-skip_reset:
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		gen8_engine_reset_cancel(engine);
-
-	return ret;
-}
-
-typedef int (*reset_func)(struct drm_i915_private *,
-			  unsigned int engine_mask, unsigned int retry);
-
-static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
-{
-	if (!i915_modparams.reset)
-		return NULL;
-
-	if (INTEL_GEN(dev_priv) >= 8)
-		return gen8_reset_engines;
-	else if (INTEL_GEN(dev_priv) >= 6)
-		return gen6_reset_engines;
-	else if (IS_GEN(dev_priv, 5))
-		return ironlake_do_reset;
-	else if (IS_G4X(dev_priv))
-		return g4x_do_reset;
-	else if (IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
-		return g33_do_reset;
-	else if (INTEL_GEN(dev_priv) >= 3)
-		return i915_do_reset;
-	else
-		return NULL;
-}
-
-int intel_gpu_reset(struct drm_i915_private *dev_priv,
-		    const unsigned int engine_mask)
-{
-	reset_func reset = intel_get_gpu_reset(dev_priv);
-	unsigned int retry;
-	int ret;
-
-	GEM_BUG_ON(!engine_mask);
-
-	/*
-	 * We want to perform per-engine reset from atomic context (e.g.
-	 * softirq), which imposes the constraint that we cannot sleep.
-	 * However, experience suggests that spending a bit of time waiting
-	 * for a reset helps in various cases, so for a full-device reset
-	 * we apply the opposite rule and wait if we want to. As we should
-	 * always follow up a failed per-engine reset with a full device reset,
-	 * being a little faster, stricter and more error prone for the
-	 * atomic case seems an acceptable compromise.
-	 *
-	 * Unfortunately this leads to a bimodal routine, when the goal was
-	 * to have a single reset function that worked for resetting any
-	 * number of engines simultaneously.
-	 */
-	might_sleep_if(engine_mask == ALL_ENGINES);
-
-	/*
-	 * If the power well sleeps during the reset, the reset
-	 * request may be dropped and never completes (causing -EIO).
-	 */
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-	for (retry = 0; retry < 3; retry++) {
-
-		/*
-		 * We stop engines, otherwise we might get failed reset and a
-		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
-		 * from system hang if batchbuffer is progressing when
-		 * the reset is issued, regardless of READY_TO_RESET ack.
-		 * Thus assume it is best to stop engines on all gens
-		 * where we have a gpu reset.
-		 *
-		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
-		 *
-		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
-		 *
-		 * FIXME: Wa for more modern gens needs to be validated
-		 */
-		i915_stop_engines(dev_priv, engine_mask);
-
-		ret = -ENODEV;
-		if (reset) {
-			ret = reset(dev_priv, engine_mask, retry);
-			GEM_TRACE("engine_mask=%x, ret=%d, retry=%d\n",
-				  engine_mask, ret, retry);
-		}
-		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
-			break;
-
-		cond_resched();
-	}
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-
-	return ret;
-}
-
-bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
-{
-	return intel_get_gpu_reset(dev_priv) != NULL;
-}
-
-bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
-{
-	return (INTEL_INFO(dev_priv)->has_reset_engine &&
-		i915_modparams.reset >= 2);
-}
-
-int intel_reset_guc(struct drm_i915_private *dev_priv)
-{
-	u32 guc_domain = INTEL_GEN(dev_priv) >= 11 ? GEN11_GRDOM_GUC :
-						     GEN9_GRDOM_GUC;
-	int ret;
-
-	GEM_BUG_ON(!HAS_GUC(dev_priv));
-
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-	ret = gen6_hw_domain_reset(dev_priv, guc_domain);
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-
-	return ret;
-}
-
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index e6073cd4719c..2b2ecd76c2ac 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -4,6 +4,8 @@
  * Copyright © 2018 Intel Corporation
  */
 
+#include "../i915_reset.h"
+
 #include "../i915_selftest.h"
 #include "igt_flush_test.h"
 #include "igt_spinner.h"
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index 9009d7b8b136..a8cac56be835 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -5,6 +5,7 @@
  */
 
 #include "../i915_selftest.h"
+#include "../i915_reset.h"
 
 #include "igt_flush_test.h"
 #include "igt_reset.h"
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/8] drm/i915: Make all GPU resets atomic
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (2 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-17 14:14   ` Mika Kuoppala
  2019-01-14 21:04 ` [PATCH 5/8] drm/i915/guc: Disable global reset Chris Wilson
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

In preparation for the next few commits, make resetting the GPU atomic.
Currently, we have prepared gen6+ for atomic resetting of individual
engines, but now there is a requirement to perform the whole device-level
reset (just the register poking) from inside an atomic context.
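
The pattern applied throughout is small: sleeping helpers are swapped for
their atomic-safe counterparts, and intel_gpu_reset() brackets the backend
callback with preempt_disable(). As a rough sketch (example_poll_done() is
just a stand-in for whichever completion check each backend polls):

	/* before: may sleep, so unusable from atomic context */
	usleep_range(50, 200);
	err = wait_for(example_poll_done(i915), 500);

	/* after: busy-waits, safe in atomic context, with a tighter timeout */
	udelay(50);
	err = wait_for_atomic(example_poll_done(i915), 50);

	/* and the reset backend itself is now called with preemption off */
	preempt_disable();
	ret = reset(i915, engine_mask, retry);
	preempt_enable();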

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c             | 50 ++++++++++---------
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  4 +-
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index e2e40b44a9a8..f9512e07646d 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -144,14 +144,14 @@ static int i915_do_reset(struct drm_i915_private *i915,
 
 	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	usleep_range(50, 200);
-	err = wait_for(i915_in_reset(pdev), 500);
+	udelay(50);
+	err = wait_for_atomic(i915_in_reset(pdev), 50);
 
 	/* Clear the reset request. */
 	pci_write_config_byte(pdev, I915_GDRST, 0);
-	usleep_range(50, 200);
+	udelay(50);
 	if (!err)
-		err = wait_for(!i915_in_reset(pdev), 500);
+		err = wait_for_atomic(!i915_in_reset(pdev), 50);
 
 	return err;
 }
@@ -171,7 +171,7 @@ static int g33_do_reset(struct drm_i915_private *i915,
 	struct pci_dev *pdev = i915->drm.pdev;
 
 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	return wait_for(g4x_reset_complete(pdev), 500);
+	return wait_for_atomic(g4x_reset_complete(pdev), 50);
 }
 
 static int g4x_do_reset(struct drm_i915_private *dev_priv,
@@ -182,13 +182,13 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 	int ret;
 
 	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
+	I915_WRITE_FW(VDECCLK_GATE_D,
+		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ_FW(VDECCLK_GATE_D);
 
 	pci_write_config_byte(pdev, I915_GDRST,
 			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
 		goto out;
@@ -196,7 +196,7 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 
 	pci_write_config_byte(pdev, I915_GDRST,
 			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
 		goto out;
@@ -205,9 +205,9 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 out:
 	pci_write_config_byte(pdev, I915_GDRST, 0);
 
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
+	I915_WRITE_FW(VDECCLK_GATE_D,
+		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ_FW(VDECCLK_GATE_D);
 
 	return ret;
 }
@@ -218,27 +218,29 @@ static int ironlake_do_reset(struct drm_i915_private *dev_priv,
 {
 	int ret;
 
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
+	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
+	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
+					   ILK_GRDOM_RESET_ENABLE, 0,
+					   5000, 0,
+					   NULL);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
 		goto out;
 	}
 
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
+	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
+	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
+					   ILK_GRDOM_RESET_ENABLE, 0,
+					   5000, 0,
+					   NULL);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
 		goto out;
 	}
 
 out:
-	I915_WRITE(ILK_GDSR, 0);
-	POSTING_READ(ILK_GDSR);
+	I915_WRITE_FW(ILK_GDSR, 0);
+	POSTING_READ_FW(ILK_GDSR);
 	return ret;
 }
 
@@ -572,7 +574,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 		ret = -ENODEV;
 		if (reset) {
 			GEM_TRACE("engine_mask=%x\n", engine_mask);
+			preempt_disable();
 			ret = reset(i915, engine_mask, retry);
+			preempt_enable();
 		}
 		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 3cda66292e76..888c6978bc54 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	cancel_delayed_work_sync(&i915->gt.retire_work);
-	cancel_delayed_work_sync(&i915->gt.idle_work);
+	drain_delayed_work(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.idle_work);
 	i915_gem_drain_workqueue(i915);
 
 	mutex_lock(&i915->drm.struct_mutex);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 5/8] drm/i915/guc: Disable global reset
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (3 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 4/8] drm/i915: Make all GPU resets atomic Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-17 14:24   ` Mika Kuoppala
  2019-01-14 21:04 ` [PATCH 6/8] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

The guc (and huc) currently depend inextricably on struct_mutex for
device reinitialisation from inside the reset, and indeed taking any
mutex here is verboten (as we must be able to reset from underneath any
of our mutexes). That makes recovering the guc unviable without, for
example, reserving contiguous vma space and pages for it to use.
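
A rough caller-side sketch of the consequence (hypothetical helper name,
simplified): with the guc in use, a full-device reset attempt is expected
to fall back to wedging, as intel_has_gpu_reset() no longer reports a
usable backend.

    static void example_full_reset(struct drm_i915_private *i915)
    {
        if (!intel_has_gpu_reset(i915)) {
            /* no usable global reset backend, e.g. USES_GUC */
            i915_gem_set_wedged(i915);
            return;
        }

        intel_gpu_reset(i915, ALL_ENGINES);
    }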

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index f9512e07646d..c9a844d2626f 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -590,6 +590,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 
 bool intel_has_gpu_reset(struct drm_i915_private *i915)
 {
+	if (USES_GUC(i915))
+		return false;
+
 	return intel_get_gpu_reset(i915);
 }
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 6/8] drm/i915: Remove GPU reset dependence on struct_mutex
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (4 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 5/8] drm/i915/guc: Disable global reset Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-14 21:04 ` [PATCH 7/8] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Now that the submission backends are controlled via their own spinlocks,
with a wave of a magic wand we can lift the struct_mutex requirement
around GPU reset. That is, we allow the submission frontend (userspace)
to keep on submitting while we process the GPU reset as we can suspend
the backend independently.

The major change is around the backoff/handoff strategy for performing
the reset. With no mutex deadlock, we no longer have to coordinate with
any waiter, and just perform the reset immediately.
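
A minimal outline of the resulting i915_reset() flow (simplified from the
patch below; the wrapper name is hypothetical, and wedging, the display
handling and error paths are omitted):

    static void i915_reset_outline(struct drm_i915_private *i915,
                                   unsigned int stalled_mask)
    {
        /* no struct_mutex, no handoff to a lock-holding waiter */
        reset_prepare(i915);            /* stop submission on each engine */
        do_reset(i915, stalled_mask);   /* hw reset + gt_reset(), retried */
        reset_finish(i915);             /* re-enable submission backends */
        reset_restart(i915);            /* worker reloads kernel contexts */
    }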

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  14 +-
 drivers/gpu/drm/i915/i915_drv.h               |   5 -
 drivers/gpu/drm/i915/i915_gem.c               |  18 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.h     |   1 -
 drivers/gpu/drm/i915/i915_gem_gtt.h           |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.h         |  24 +-
 drivers/gpu/drm/i915/i915_request.c           |  47 ---
 drivers/gpu/drm/i915/i915_reset.c             | 397 ++++++++----------
 drivers/gpu/drm/i915/i915_reset.h             |   3 +
 drivers/gpu/drm/i915/intel_engine_cs.c        |   6 +-
 drivers/gpu/drm/i915/intel_guc_submission.c   |   5 +-
 drivers/gpu/drm/i915/intel_lrc.c              |  92 ++--
 drivers/gpu/drm/i915/intel_overlay.c          |   2 -
 drivers/gpu/drm/i915/intel_ringbuffer.c       |  91 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  13 +-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  57 +--
 .../drm/i915/selftests/intel_workarounds.c    |   3 -
 17 files changed, 317 insertions(+), 462 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a93abb2274e6..28fd54b1f11a 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_puts(m, "Wedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
 		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -3907,11 +3905,6 @@ i915_wedged_set(void *data, u64 val)
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-
-	wait_on_bit(&i915->gpu_error.flags,
-		    I915_RESET_HANDOFF,
-		    TASK_UNINTERRUPTIBLE);
-
 	return 0;
 }
 
@@ -4066,13 +4059,8 @@ i915_drop_caches_set(void *data, u64 val)
 		mutex_unlock(&i915->drm.struct_mutex);
 	}
 
-	if (val & DROP_RESET_ACTIVE &&
-	    i915_terminally_wedged(&i915->gpu_error)) {
+	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
 		i915_handle_error(i915, ALL_ENGINES, 0, NULL);
-		wait_on_bit(&i915->gpu_error.flags,
-			    I915_RESET_HANDOFF,
-			    TASK_UNINTERRUPTIBLE);
-	}
 
 	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 224d433ac7b6..369de69e1d65 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2997,11 +2997,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a7e0d61a45ea..24ba698fc5b5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
 		     struct intel_rps_client *rps_client)
 {
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4479,8 +4474,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	mutex_lock(&i915->drm.struct_mutex);
-
 	wakeref = intel_runtime_pm_get(i915);
 	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
@@ -4506,6 +4499,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915, wakeref);
 
+	mutex_lock(&i915->drm.struct_mutex);
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 }
@@ -4520,6 +4514,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	wakeref = intel_runtime_pm_get(i915);
 	intel_suspend_gt_powersave(i915);
 
+	flush_workqueue(i915->wq);
+
 	mutex_lock(&i915->drm.struct_mutex);
 
 	/*
@@ -4549,11 +4545,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	i915_retire_requests(i915); /* ensure we flush after wedging */
 
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_reset_flush(i915);
 
-	intel_uc_suspend(i915);
-
-	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-	cancel_delayed_work_sync(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.retire_work);
 
 	/*
 	 * As the idle_work is rearming if it detects a race, play safe and
@@ -4561,6 +4555,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	 */
 	drain_delayed_work(&i915->gt.idle_work);
 
+	intel_uc_suspend(i915);
+
 	/*
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..09dcaf14121b 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
 };
 
 #endif
-
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 9229b03d629b..a0039ea97cdc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -39,6 +39,7 @@
 #include <linux/pagevec.h>
 
 #include "i915_request.h"
+#include "i915_reset.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 604291f7762d..733723e1ea03 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -197,6 +197,8 @@ struct i915_gpu_state {
 	struct scatterlist *sgl, *fit;
 };
 
+struct i915_gpu_restart;
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
@@ -247,15 +249,6 @@ struct i915_gpu_error {
 	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
 	 * secondary role in preventing two concurrent global reset attempts.
 	 *
-	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
-	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
-	 * but it may be held by some long running waiter (that we cannot
-	 * interrupt without causing trouble). Once we are ready to do the GPU
-	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
-	 * they already hold the struct_mutex and want to participate they can
-	 * inspect the bit and do the reset directly, otherwise the worker
-	 * waits for the struct_mutex.
-	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
 	 * flag to prevent two concurrent reset attempts in the same engine.
@@ -269,20 +262,13 @@ struct i915_gpu_error {
 	 */
 	unsigned long flags;
 #define I915_RESET_BACKOFF	0
-#define I915_RESET_HANDOFF	1
-#define I915_RESET_MODESET	2
-#define I915_RESET_ENGINE	3
+#define I915_RESET_MODESET	1
+#define I915_RESET_ENGINE	2
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
 
-	/** Set of stalled engines with guilty requests, in the current reset */
-	u32 stalled_mask;
-
-	/** Reason for the current *global* reset */
-	const char *reason;
-
 	struct mutex wedge_mutex; /* serialises wedging/unwedging */
 
 	/**
@@ -299,6 +285,8 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned long test_irq_rings;
+
+	struct i915_gpu_restart *restart;
 };
 
 struct drm_i915_error_state_buf {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5403d4e2cee0..fb723ed2f574 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1076,18 +1076,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	return false;
 }
 
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
-	struct i915_gpu_error *error = &request->i915->gpu_error;
-
-	if (likely(!i915_reset_handoff(error)))
-		return false;
-
-	__set_current_state(TASK_RUNNING);
-	i915_reset(request->i915, error->stalled_mask, error->reason);
-	return true;
-}
-
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1113,17 +1101,10 @@ long i915_request_wait(struct i915_request *rq,
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
-	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
 	struct intel_wait wait;
 
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	if (i915_request_completed(rq))
@@ -1133,11 +1114,7 @@ long i915_request_wait(struct i915_request *rq,
 		return -ETIME;
 
 	trace_i915_request_wait_begin(rq, flags);
-
 	add_wait_queue(&rq->execute, &exec);
-	if (flags & I915_WAIT_LOCKED)
-		add_wait_queue(errq, &reset);
-
 	intel_wait_init(&wait);
 	if (flags & I915_WAIT_PRIORITY)
 		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
@@ -1148,10 +1125,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (intel_wait_update_request(&wait, rq))
 			break;
 
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
 			goto complete;
@@ -1181,9 +1154,6 @@ long i915_request_wait(struct i915_request *rq,
 		 */
 		goto wakeup;
 
-	if (flags & I915_WAIT_LOCKED)
-		__i915_wait_request_check_and_reset(rq);
-
 	for (;;) {
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
@@ -1207,21 +1177,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (i915_request_completed(rq))
 			break;
 
-		/*
-		 * If the GPU is hung, and we hold the lock, reset the GPU
-		 * and then check for completion. On a full reset, the engine's
-		 * HW seqno will be advanced passed us and we are complete.
-		 * If we do a partial reset, we have to wait for the GPU to
-		 * resume and update the breadcrumb.
-		 *
-		 * If we don't hold the mutex, we can just wait for the worker
-		 * to come along and update the breadcrumb (either directly
-		 * itself, or indirectly by recovering the GPU).
-		 */
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		/* Only spin if we know the GPU is processing this request */
 		if (__i915_spin_request(rq, wait.seqno, state, 2))
 			break;
@@ -1235,8 +1190,6 @@ long i915_request_wait(struct i915_request *rq,
 	intel_engine_remove_wait(rq->engine, &wait);
 complete:
 	__set_current_state(TASK_RUNNING);
-	if (flags & I915_WAIT_LOCKED)
-		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
 	trace_i915_request_wait_end(rq);
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c9a844d2626f..30f669aa526a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
@@ -17,22 +18,23 @@ static void engine_skip_context(struct i915_request *rq)
 	struct intel_engine_cs *engine = rq->engine;
 	struct i915_gem_context *hung_ctx = rq->gem_context;
 	struct i915_timeline *timeline = rq->timeline;
-	unsigned long flags;
 
+	lockdep_assert_held(&engine->timeline.lock);
 	GEM_BUG_ON(timeline == &engine->timeline);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
 	spin_lock(&timeline->lock);
 
-	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
-		if (rq->gem_context == hung_ctx)
-			i915_request_skip(rq, -EIO);
+	if (rq->global_seqno) {
+		list_for_each_entry_continue(rq,
+					     &engine->timeline.requests, link)
+			if (rq->gem_context == hung_ctx)
+				i915_request_skip(rq, -EIO);
+	}
 
 	list_for_each_entry(rq, &timeline->requests, link)
 		i915_request_skip(rq, -EIO);
 
 	spin_unlock(&timeline->lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
@@ -59,7 +61,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
 	}
 }
 
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
 {
 	unsigned int score;
 	bool banned, bannable;
@@ -72,7 +74,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
-		return;
+		return false;
 
 	if (banned) {
 		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
@@ -83,6 +85,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	if (!IS_ERR_OR_NULL(ctx->file_priv))
 		client_mark_guilty(ctx->file_priv, ctx);
+
+	return banned;
 }
 
 static void context_mark_innocent(struct i915_gem_context *ctx)
@@ -90,6 +94,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
 	atomic_inc(&ctx->active_count);
 }
 
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+	lockdep_assert_held(&rq->engine->timeline.lock);
+	GEM_BUG_ON(i915_request_completed(rq));
+
+	if (guilty) {
+		i915_request_skip(rq, -EIO);
+		if (context_mark_guilty(rq->gem_context))
+			engine_skip_context(rq);
+	} else {
+		dma_fence_set_error(&rq->fence, -EAGAIN);
+		context_mark_innocent(rq->gem_context);
+	}
+}
+
 static void gen3_stop_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
@@ -533,22 +552,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 	int retry;
 	int ret;
 
-	/*
-	 * We want to perform per-engine reset from atomic context (e.g.
-	 * softirq), which imposes the constraint that we cannot sleep.
-	 * However, experience suggests that spending a bit of time waiting
-	 * for a reset helps in various cases, so for a full-device reset
-	 * we apply the opposite rule and wait if we want to. As we should
-	 * always follow up a failed per-engine reset with a full device reset,
-	 * being a little faster, stricter and more error prone for the
-	 * atomic case seems an acceptable compromise.
-	 *
-	 * Unfortunately this leads to a bimodal routine, when the goal was
-	 * to have a single reset function that worked for resetting any
-	 * number of engines simultaneously.
-	 */
-	might_sleep_if(engine_mask == ALL_ENGINES);
-
 	/*
 	 * If the power well sleeps during the reset, the reset
 	 * request may be dropped and never completes (causing -EIO).
@@ -580,8 +583,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 		}
 		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
-
-		cond_resched();
 	}
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 
@@ -620,11 +621,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
  * Ensure irq handler finishes, and not run again.
  * Also return the active request so that we only search for it once.
  */
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq;
-
 	/*
 	 * During the reset sequence, we must prevent the engine from
 	 * entering RC6. As the context state is undefined until we restart
@@ -633,162 +631,86 @@ reset_prepare_engine(struct intel_engine_cs *engine)
 	 * GPU state upon resume, i.e. fail to restart after a reset.
 	 */
 	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-	rq = engine->reset.prepare(engine);
-	if (rq && rq->fence.error == -EIO)
-		rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
-	return rq;
+	engine->reset.prepare(engine);
 }
 
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
-	struct i915_request *rq;
 	enum intel_engine_id id;
-	int err = 0;
 
-	for_each_engine(engine, i915, id) {
-		rq = reset_prepare_engine(engine);
-		if (IS_ERR(rq)) {
-			err = PTR_ERR(rq);
-			continue;
-		}
-
-		engine->hangcheck.active_request = rq;
-	}
+	for_each_engine(engine, i915, id)
+		reset_prepare_engine(engine);
 
-	i915_gem_revoke_fences(i915);
 	intel_uc_sanitize(i915);
-
-	return err;
 }
 
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
-	      struct i915_request *rq,
-	      bool stalled)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err;
+
 	/*
-	 * The guilty request will get skipped on a hung engine.
-	 *
-	 * Users of client default contexts do not rely on logical
-	 * state preserved between batches so it is safe to execute
-	 * queued requests following the hang. Non default contexts
-	 * rely on preserved state, so skipping a batch loses the
-	 * evolution of the state and it needs to be considered corrupted.
-	 * Executing more queued batches on top of corrupted state is
-	 * risky. But we take the risk by trying to advance through
-	 * the queued requests in order to make the client behaviour
-	 * more predictable around resets, by not throwing away random
-	 * amount of batches it has prepared for execution. Sophisticated
-	 * clients can use gem_reset_stats_ioctl and dma fence status
-	 * (exported via sync_file info ioctl on explicit fences) to observe
-	 * when it loses the context state and should rebuild accordingly.
-	 *
-	 * The context ban, and ultimately the client ban, mechanism are safety
-	 * valves if client submission ends up resulting in nothing more than
-	 * subsequent hangs.
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.
 	 */
+	err = i915_ggtt_enable_hw(i915);
+	if (err)
+		return err;
 
-	if (i915_request_completed(rq)) {
-		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
-			  engine->name, rq->global_seqno,
-			  rq->fence.context, rq->fence.seqno,
-			  intel_engine_get_seqno(engine));
-		stalled = false;
-	}
-
-	if (stalled) {
-		context_mark_guilty(rq->gem_context);
-		i915_request_skip(rq, -EIO);
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
 
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(rq->gem_context))
-			engine_skip_context(rq);
-	} else {
-		/*
-		 * Since this is not the hung engine, it may have advanced
-		 * since the hang declaration. Double check by refinding
-		 * the active request at the time of the reset.
-		 */
-		rq = i915_gem_find_active_request(engine);
-		if (rq) {
-			unsigned long flags;
-
-			context_mark_innocent(rq->gem_context);
-			dma_fence_set_error(&rq->fence, -EAGAIN);
-
-			/* Rewind the engine to replay the incomplete rq */
-			spin_lock_irqsave(&engine->timeline.lock, flags);
-			rq = list_prev_entry(rq, link);
-			if (&rq->link == &engine->timeline.requests)
-				rq = NULL;
-			spin_unlock_irqrestore(&engine->timeline.lock, flags);
-		}
-	}
+	i915_gem_restore_fences(i915);
 
-	return rq;
+	return err;
 }
 
-static void reset_engine(struct intel_engine_cs *engine,
-			 struct i915_request *rq,
-			 bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
 {
-	if (rq)
-		rq = reset_request(engine, rq, stalled);
-
-	/* Setup the CS to resume from the breadcrumb of the hung request */
-	engine->reset.reset(engine, rq);
+	engine->reset.finish(engine);
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
 }
 
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+struct i915_gpu_restart {
+	struct work_struct work;
+	struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
 {
+	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+	struct drm_i915_private *i915 = arg->i915;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+	mutex_lock(&i915->drm.struct_mutex);
 
-	i915_retire_requests(i915);
+	smp_store_mb(i915->gpu_error.restart, NULL);
 
 	for_each_engine(engine, i915, id) {
-		struct intel_context *ce;
-
-		reset_engine(engine,
-			     engine->hangcheck.active_request,
-			     stalled_mask & ENGINE_MASK(id));
-		ce = fetch_and_zero(&engine->last_retired_context);
-		if (ce)
-			intel_context_unpin(ce);
+		struct i915_request *rq;
 
 		/*
 		 * Ostensibily, we always want a context loaded for powersaving,
 		 * so if the engine is idle after the reset, send a request
 		 * to load our scratch kernel_context.
-		 *
-		 * More mysteriously, if we leave the engine idle after a reset,
-		 * the next userspace batch may hang, with what appears to be
-		 * an incoherent read by the CS (presumably stale TLB). An
-		 * empty request appears sufficient to paper over the glitch.
 		 */
-		if (intel_engine_is_idle(engine)) {
-			struct i915_request *rq;
+		if (!intel_engine_is_idle(engine))
+			continue;
 
-			rq = i915_request_alloc(engine, i915->kernel_context);
-			if (!IS_ERR(rq))
-				i915_request_add(rq);
-		}
+		rq = i915_request_alloc(engine, i915->kernel_context);
+		if (!IS_ERR(rq))
+			i915_request_add(rq);
 	}
 
-	i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
-	engine->reset.finish(engine);
+	mutex_unlock(&i915->drm.struct_mutex);
+	intel_runtime_pm_put(i915, wakeref);
 
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+	kfree(arg);
 }
 
 static void reset_finish(struct drm_i915_private *i915)
@@ -796,11 +718,30 @@ static void reset_finish(struct drm_i915_private *i915)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
-	for_each_engine(engine, i915, id) {
-		engine->hangcheck.active_request = NULL;
+	for_each_engine(engine, i915, id)
 		reset_finish_engine(engine);
+}
+
+static void reset_restart(struct drm_i915_private *i915)
+{
+	struct i915_gpu_restart *arg;
+
+	/*
+	 * Following the reset, ensure that we always reload context for
+	 * powersaving, and to correct engine->last_retired_context. Since
+	 * this requires us to submit a request, queue a worker to do that
+	 * task for us to evade any locking here.
+	 */
+	if (READ_ONCE(i915->gpu_error.restart))
+		return;
+
+	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+	if (arg) {
+		arg->i915 = i915;
+		INIT_WORK(&arg->work, restart_work);
+
+		WRITE_ONCE(i915->gpu_error.restart, arg);
+		queue_work(i915->wq, &arg->work);
 	}
 }
 
@@ -889,8 +830,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	struct i915_timeline *tl;
 	bool ret = false;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
 	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
 
@@ -913,9 +852,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 */
 	list_for_each_entry(tl, &i915->gt.timelines, link) {
 		struct i915_request *rq;
+		long timeout;
 
-		rq = i915_gem_active_peek(&tl->last_request,
-					  &i915->drm.struct_mutex);
+		rq = i915_gem_active_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
@@ -930,12 +869,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		 * and when the seqno passes the fence, the signaler
 		 * then signals the fence waking us up).
 		 */
-		if (dma_fence_default_wait(&rq->fence, true,
-					   MAX_SCHEDULE_TIMEOUT) < 0)
+		timeout = dma_fence_default_wait(&rq->fence, true,
+						 MAX_SCHEDULE_TIMEOUT);
+		i915_request_put(rq);
+		if (timeout < 0)
 			goto unlock;
 	}
-	i915_retire_requests(i915);
-	GEM_BUG_ON(i915->gt.active_requests);
 
 	intel_engines_sanitize(i915, false);
 
@@ -949,7 +888,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
-	i915_gem_contexts_lost(i915);
 
 	GEM_TRACE("end\n");
 
@@ -962,6 +900,43 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
+struct __i915_reset {
+	struct drm_i915_private *i915;
+	unsigned int stalled_mask;
+};
+
+static int __i915_reset__BKL(void *data)
+{
+	struct __i915_reset *arg = data;
+	int err;
+
+	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
+	if (err)
+		return err;
+
+	return gt_reset(arg->i915, arg->stalled_mask);
+}
+
+#if 0
+#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
+#else
+#define __do_reset(fn, arg) fn(arg)
+#endif
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+	struct __i915_reset arg = { i915, stalled_mask };
+	int err, i;
+
+	err = __do_reset(__i915_reset__BKL, &arg);
+	for (i = 0; err && i < 3; i++) {
+		msleep(100);
+		err = __do_reset(__i915_reset__BKL, &arg);
+	}
+
+	return err;
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @i915: #drm_i915_private to reset
@@ -987,31 +962,22 @@ void i915_reset(struct drm_i915_private *i915,
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	int ret;
-	int i;
 
 	GEM_TRACE("flags=%lx\n", error->flags);
 
 	might_sleep();
-	lockdep_assert_held(&i915->drm.struct_mutex);
 	assert_rpm_wakelock_held(i915);
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-		return;
-
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	if (!i915_gem_unset_wedged(i915))
-		goto wakeup;
+		return;
 
 	if (reason)
 		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
 	error->reset_count++;
 
-	ret = reset_prepare(i915);
-	if (ret) {
-		dev_err(i915->drm.dev, "GPU recovery failed\n");
-		goto taint;
-	}
+	reset_prepare(i915);
 
 	if (!intel_has_gpu_reset(i915)) {
 		if (i915_modparams.reset)
@@ -1021,32 +987,11 @@ void i915_reset(struct drm_i915_private *i915,
 		goto error;
 	}
 
-	for (i = 0; i < 3; i++) {
-		ret = intel_gpu_reset(i915, ALL_ENGINES);
-		if (ret == 0)
-			break;
-
-		msleep(100);
-	}
-	if (ret) {
+	if (do_reset(i915, stalled_mask)) {
 		dev_err(i915->drm.dev, "Failed to reset chip\n");
 		goto taint;
 	}
 
-	/* Ok, now get things going again... */
-
-	/*
-	 * Everything depends on having the GTT running, so we need to start
-	 * there.
-	 */
-	ret = i915_ggtt_enable_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	gt_reset(i915, stalled_mask);
 	intel_overlay_reset(i915);
 
 	/*
@@ -1068,9 +1013,8 @@ void i915_reset(struct drm_i915_private *i915,
 
 finish:
 	reset_finish(i915);
-wakeup:
-	clear_bit(I915_RESET_HANDOFF, &error->flags);
-	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+	if (!i915_terminally_wedged(error))
+		reset_restart(i915);
 	return;
 
 taint:
@@ -1089,7 +1033,6 @@ void i915_reset(struct drm_i915_private *i915,
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
 	i915_gem_set_wedged(i915);
-	i915_retire_requests(i915);
 	goto finish;
 }
 
@@ -1115,18 +1058,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 {
 	struct i915_gpu_error *error = &engine->i915->gpu_error;
-	struct i915_request *active_request;
 	int ret;
 
 	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-	active_request = reset_prepare_engine(engine);
-	if (IS_ERR_OR_NULL(active_request)) {
-		/* Either the previous reset failed, or we pardon the reset. */
-		ret = PTR_ERR(active_request);
-		goto out;
-	}
+	if (i915_seqno_passed(intel_engine_get_seqno(engine),
+			      intel_engine_last_submit(engine)))
+		return 0;
+
+	reset_prepare_engine(engine);
 
 	if (msg)
 		dev_notice(engine->i915->drm.dev,
@@ -1150,7 +1091,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
-	reset_engine(engine, active_request, true);
+	intel_engine_reset(engine, true);
 
 	/*
 	 * The engine and its registers (and workarounds in case of render)
@@ -1187,30 +1128,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
-		error->reason = reason;
-		error->stalled_mask = engine_mask;
-
-		/* Signal that locked waiters should reset the GPU */
-		smp_mb__before_atomic();
-		set_bit(I915_RESET_HANDOFF, &error->flags);
-		wake_up_all(&error->wait_queue);
-
-		/*
-		 * Wait for anyone holding the lock to wakeup, without
-		 * blocking indefinitely on struct_mutex.
-		 */
-		do {
-			if (mutex_trylock(&i915->drm.struct_mutex)) {
-				i915_reset(i915, engine_mask, reason);
-				mutex_unlock(&i915->drm.struct_mutex);
-			}
-		} while (wait_on_bit_timeout(&error->flags,
-					     I915_RESET_HANDOFF,
-					     TASK_UNINTERRUPTIBLE,
-					     1));
-
-		error->stalled_mask = 0;
-		error->reason = NULL;
+		i915_reset(i915, engine_mask, reason);
 
 		intel_finish_reset(i915);
 	}
@@ -1366,6 +1284,25 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+	int err;
+
+	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+	flush_workqueue(i915->wq);
+	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED |
+				     I915_WAIT_FOR_IDLE_BOOST,
+				     MAX_SCHEDULE_TIMEOUT);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return !err;
+}
+
 static void i915_wedge_me(struct work_struct *work)
 {
 	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index b6a519bde67d..f2d347f319df 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine,
 		      const char *reason);
 
+void i915_reset_request(struct i915_request *rq, bool guilty);
+bool i915_reset_flush(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 9c943bb95cb9..3a0b8036f173 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1074,10 +1074,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force)
 	if (!reset_engines(i915) && !force)
 		return;
 
-	for_each_engine(engine, i915, id) {
-		if (engine->reset.reset)
-			engine->reset.reset(engine, NULL);
-	}
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, false);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index ab1c49b106f2..7217c7e3ee8d 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 
@@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
 	 */
 	if (engine->i915->guc.preempt_wq)
 		flush_workqueue(engine->i915->guc.preempt_wq);
-
-	return i915_gem_find_active_request(engine);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 608458b92db3..ee234b60c5e1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -136,6 +136,7 @@
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
@@ -264,7 +265,8 @@ static void unwind_wa_tail(struct i915_request *rq)
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn, *active = NULL;
 	struct list_head *uninitialized_var(pl);
@@ -305,6 +307,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		list_move_tail(&active->sched.link,
 			       i915_sched_lookup_priolist(engine, prio));
 	}
+
+	return active;
 }
 
 void
@@ -1712,11 +1716,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	return 0;
 }
 
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct i915_request *request, *active;
 	unsigned long flags;
 
 	GEM_TRACE("%s: depth<-%d\n", engine->name,
@@ -1732,59 +1734,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
 	 * prevents the race.
 	 */
 	__tasklet_disable_sync_once(&execlists->tasklet);
+	GEM_BUG_ON(!reset_in_progress(execlists));
 
+	/* And flush any current direct submission. */
 	spin_lock_irqsave(&engine->timeline.lock, flags);
-
-	/*
-	 * We want to flush the pending context switches, having disabled
-	 * the tasklet above, we can assume exclusive access to the execlists.
-	 * For this allows us to catch up with an inflight preemption event,
-	 * and avoid blaming an innocent request if the stall was due to the
-	 * preemption itself.
-	 */
-	process_csb(engine);
-
-	/*
-	 * The last active request can then be no later than the last request
-	 * now in ELSP[0]. So search backwards from there, so that if the GPU
-	 * has advanced beyond the last CSB update, it will be pardoned.
-	 */
-	active = NULL;
-	request = port_request(execlists->port);
-	if (request) {
-		/*
-		 * Prevent the breadcrumb from advancing before we decide
-		 * which request is currently active.
-		 */
-		intel_engine_stop_cs(engine);
-
-		list_for_each_entry_from_reverse(request,
-						 &engine->timeline.requests,
-						 link) {
-			if (__i915_request_completed(request,
-						     request->global_seqno))
-				break;
-
-			active = request;
-		}
-	}
-
+	process_csb(engine); /* drain preemption events */
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-	return active;
 }
 
-static void execlists_reset(struct intel_engine_cs *engine,
-			    struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *rq;
 	unsigned long flags;
 	u32 *regs;
 
-	GEM_TRACE("%s request global=%d, current=%d\n",
-		  engine->name, request ? request->global_seqno : 0,
-		  intel_engine_get_seqno(engine));
-
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	/*
@@ -1799,12 +1763,18 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	execlists_cancel_port_requests(execlists);
 
 	/* Push back any incomplete requests for replay after the reset. */
-	__unwind_incomplete_requests(engine);
+	rq = __unwind_incomplete_requests(engine);
 
 	/* Following the reset, we need to reload the CSB read/write pointers */
 	reset_csb_pointers(&engine->execlists);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+		  engine->name,
+		  rq ? lower_32_bits(rq->global_seqno) : 0,
+		  intel_engine_get_seqno(engine),
+		  yesno(stalled));
+	if (!rq)
+		goto out_unlock;
 
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
@@ -1817,8 +1787,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * and have to at least restore the RING register in the context
 	 * image back to the expected values to skip over the guilty request.
 	 */
-	if (!request || request->fence.error != -EIO)
-		return;
+	i915_reset_request(rq, stalled);
+	if (!stalled)
+		goto out_unlock;
 
 	/*
 	 * We want a simple context + ring to execute the breadcrumb update.
@@ -1828,25 +1799,23 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	regs = request->hw_context->lrc_reg_state;
+	regs = rq->hw_context->lrc_reg_state;
 	if (engine->pinned_default_state) {
 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
 		       engine->context_size - PAGE_SIZE);
 	}
-	execlists_init_reg_state(regs,
-				 request->gem_context, engine, request->ring);
+	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
-
-	request->ring->head = intel_ring_wrap(request->ring, request->postfix);
-	regs[CTX_RING_HEAD + 1] = request->ring->head;
+	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma);
 
-	intel_ring_update_space(request->ring);
+	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+	regs[CTX_RING_HEAD + 1] = rq->ring->head;
+	intel_ring_update_space(rq->ring);
 
-	/* Reset WaIdleLiteRestore:bdw,skl as well */
-	unwind_wa_tail(request);
+out_unlock:
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void execlists_reset_finish(struct intel_engine_cs *engine)
@@ -1859,6 +1828,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
 	 * to sleep before we restart and reload a context.
 	 *
 	 */
+	GEM_BUG_ON(!reset_in_progress(execlists));
 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
 		execlists->tasklet.func(execlists->tasklet.data);
 
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index c81db81e4416..f68c7975006c 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
 	if (!overlay)
 		return;
 
-	intel_overlay_release_old_vid(overlay);
-
 	overlay->old_xscale = 0;
 	overlay->old_yscale = 0;
 	overlay->crtc = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 26b7274a2d43..6d032bdffacc 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -33,6 +33,7 @@
 
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include "intel_workarounds.h"
@@ -707,52 +708,80 @@ static int init_ring_common(struct intel_engine_cs *engine)
 	return ret;
 }
 
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
 {
 	intel_engine_stop_cs(engine);
-	return i915_gem_find_active_request(engine);
 }
 
-static void skip_request(struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 {
-	void *vaddr = rq->ring->vaddr;
+	struct i915_timeline *tl = &engine->timeline;
+	struct i915_request *pos, *rq;
+	unsigned long flags;
 	u32 head;
 
-	head = rq->infix;
-	if (rq->postfix < head) {
-		memset32(vaddr + head, MI_NOOP,
-			 (rq->ring->size - head) / sizeof(u32));
-		head = 0;
+	rq = NULL;
+	spin_lock_irqsave(&tl->lock, flags);
+	list_for_each_entry(pos, &tl->requests, link) {
+		if (!__i915_request_completed(pos, pos->global_seqno)) {
+			rq = pos;
+			break;
+		}
 	}
-	memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
-}
-
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
-{
-	GEM_TRACE("%s request global=%d, current=%d\n",
-		  engine->name, rq ? rq->global_seqno : 0,
-		  intel_engine_get_seqno(engine));
 
+	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+		  engine->name,
+		  rq ? lower_32_bits(rq->global_seqno) : 0,
+		  intel_engine_get_seqno(engine),
+		  yesno(stalled));
 	/*
-	 * Try to restore the logical GPU state to match the continuation
-	 * of the request queue. If we skip the context/PD restore, then
-	 * the next request may try to execute assuming that its context
-	 * is valid and loaded on the GPU and so may try to access invalid
-	 * memory, prompting repeated GPU hangs.
+	 * The guilty request will get skipped on a hung engine.
 	 *
-	 * If the request was guilty, we still restore the logical state
-	 * in case the next request requires it (e.g. the aliasing ppgtt),
-	 * but skip over the hung batch.
+	 * Users of client default contexts do not rely on logical
+	 * state preserved between batches so it is safe to execute
+	 * queued requests following the hang. Non default contexts
+	 * rely on preserved state, so skipping a batch loses the
+	 * evolution of the state and it needs to be considered corrupted.
+	 * Executing more queued batches on top of corrupted state is
+	 * risky. But we take the risk by trying to advance through
+	 * the queued requests in order to make the client behaviour
+	 * more predictable around resets, by not throwing away random
+	 * amount of batches it has prepared for execution. Sophisticated
+	 * clients can use gem_reset_stats_ioctl and dma fence status
+	 * (exported via sync_file info ioctl on explicit fences) to observe
+	 * when it loses the context state and should rebuild accordingly.
 	 *
-	 * If the request was innocent, we try to replay the request with
-	 * the restored context.
+	 * The context ban, and ultimately the client ban, mechanism are safety
+	 * valves if client submission ends up resulting in nothing more than
+	 * subsequent hangs.
 	 */
+
 	if (rq) {
-		/* If the rq hung, jump to its breadcrumb and skip the batch */
-		rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
-		if (rq->fence.error == -EIO)
-			skip_request(rq);
+		/*
+		 * Try to restore the logical GPU state to match the
+		 * continuation of the request queue. If we skip the
+		 * context/PD restore, then the next request may try to execute
+		 * assuming that its context is valid and loaded on the GPU and
+		 * so may try to access invalid memory, prompting repeated GPU
+		 * hangs.
+		 *
+		 * If the request was guilty, we still restore the logical
+		 * state in case the next request requires it (e.g. the
+		 * aliasing ppgtt), but skip over the hung batch.
+		 *
+		 * If the request was innocent, we try to replay the request
+		 * with the restored context.
+		 */
+		i915_reset_request(rq, stalled);
+
+		GEM_BUG_ON(rq->ring != engine->buffer);
+		head = rq->head;
+	} else {
+		head = engine->buffer->tail;
 	}
+	engine->buffer->head = intel_ring_wrap(engine->buffer, head);
+
+	spin_unlock_irqrestore(&tl->lock, flags);
 }
 
 static void reset_finish(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3c1366c58cf3..06850ee17087 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -124,7 +124,6 @@ struct intel_engine_hangcheck {
 	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
-	struct i915_request *active_request;
 	bool stalled:1;
 	bool wedged:1;
 };
@@ -445,9 +444,8 @@ struct intel_engine_cs {
 	int		(*init_hw)(struct intel_engine_cs *engine);
 
 	struct {
-		struct i915_request *(*prepare)(struct intel_engine_cs *engine);
-		void (*reset)(struct intel_engine_cs *engine,
-			      struct i915_request *rq);
+		void (*prepare)(struct intel_engine_cs *engine);
+		void (*reset)(struct intel_engine_cs *engine, bool stalled);
 		void (*finish)(struct intel_engine_cs *engine);
 	} reset;
 
@@ -1019,6 +1017,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
 	return cs;
 }
 
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+				      bool stalled)
+{
+	if (engine->reset.reset)
+		engine->reset.reset(engine, stalled);
+}
+
 void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
 
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 12550b55c42f..67431355cd6e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -363,9 +363,7 @@ static int igt_global_reset(void *arg)
 	/* Check that we can issue a global GPU reset */
 
 	igt_global_reset_lock(i915);
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
-	mutex_lock(&i915->drm.struct_mutex);
 	reset_count = i915_reset_count(&i915->gpu_error);
 
 	i915_reset(i915, ALL_ENGINES, NULL);
@@ -374,9 +372,7 @@ static int igt_global_reset(void *arg)
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
 	}
-	mutex_unlock(&i915->drm.struct_mutex);
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	igt_global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
@@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg)
 	i915_gem_set_wedged(i915);
 	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 	i915_reset(i915, ALL_ENGINES, NULL);
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
@@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				break;
 			}
 
-			if (!wait_for_idle(engine)) {
+			if (!i915_reset_flush(i915)) {
 				struct drm_printer p =
 					drm_info_printer(i915->drm.dev);
 
@@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg)
 	return 0;
 }
 
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
 {
-	struct i915_gpu_error *error = &rq->i915->gpu_error;
-	u32 reset_count = i915_reset_count(error);
-
-	error->stalled_mask = mask;
-
-	/* set_bit() must be after we have setup the backchannel (mask) */
-	smp_mb__before_atomic();
-	set_bit(I915_RESET_HANDOFF, &error->flags);
+	u32 count = i915_reset_count(&i915->gpu_error);
 
-	wake_up_all(&error->wait_queue);
+	i915_reset(i915, mask, NULL);
 
-	return reset_count;
+	return count;
 }
 
 static int igt_reset_wait(void *arg)
@@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg)
 		goto out_rq;
 	}
 
-	reset_count = fake_hangcheck(rq, ALL_ENGINES);
+	reset_count = fake_hangcheck(i915, ALL_ENGINES);
 
 	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
 	if (timeout < 0) {
@@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg)
 		goto out_rq;
 	}
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	if (i915_reset_count(&i915->gpu_error) == reset_count) {
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
@@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
-	fake_hangcheck(rq, intel_engine_flag(rq->engine));
+	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
 
 	if (tsk) {
 		struct igt_wedge_me w;
@@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg)
 				goto fini;
 			}
 
-			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
-			i915_reset(i915, ENGINE_MASK(id), NULL);
-
-			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
-					    &i915->gpu_error.flags));
+			reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
 
 			if (prev->fence.error != -EIO) {
 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
@@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
 		       __func__, engine->name,
 		       rq->fence.seqno, hws_seqno(&h, rq));
+		i915_gem_set_wedged(i915);
 		err = -EIO;
 	}
 
@@ -1588,7 +1570,6 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
 static void force_reset(struct drm_i915_private *i915)
 {
 	i915_gem_set_wedged(i915);
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 	i915_reset(i915, 0, NULL);
 }
 
@@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg)
 	if (i915_terminally_wedged(&i915->gpu_error))
 		goto unlock;
 
+	if (intel_has_gpu_reset(i915)) {
+		const typeof(*phases) *p;
+
+		for (p = phases; p->name; p++) {
+			GEM_TRACE("intel_gpu_reset under %s\n", p->name);
+
+			p->critical_section_begin();
+			err = intel_gpu_reset(i915, ALL_ENGINES);
+			p->critical_section_end();
+
+			if (err) {
+				pr_err("intel_gpu_reset failed under %s\n",
+				       p->name);
+				goto out;
+			}
+		}
+
+		force_reset(i915);
+	}
+
 	if (intel_has_reset_engine(i915)) {
 		struct intel_engine_cs *engine;
 		enum intel_engine_id id;
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index a8cac56be835..b15c4f26c593 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -214,7 +214,6 @@ static int check_whitelist(struct i915_gem_context *ctx,
 
 static int do_device_reset(struct intel_engine_cs *engine)
 {
-	set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
 	i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
 	return 0;
 }
@@ -394,7 +393,6 @@ static int
 live_gpu_reset_gt_engine_workarounds(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
-	struct i915_gpu_error *error = &i915->gpu_error;
 	intel_wakeref_t wakeref;
 	struct wa_lists lists;
 	bool ok;
@@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg)
 	if (!ok)
 		goto out;
 
-	set_bit(I915_RESET_HANDOFF, &error->flags);
 	i915_reset(i915, ALL_ENGINES, "live_workarounds");
 
 	ok = verify_gt_engine_wa(i915, &lists, "after reset");
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 7/8] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (5 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 6/8] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-14 21:04 ` [PATCH 8/8] drm/i915: Issue engine resets onto idle engines Chris Wilson
  2019-01-14 21:26 ` ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Patchwork
  8 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Trim the struct_mutex hold and move i915_gem_set_wedged() outside of it,
as a reminder that it must be callable without struct_mutex held.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 67431355cd6e..28144fd72550 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -389,16 +389,16 @@ static int igt_wedged_reset(void *arg)
 	/* Check that we can recover a wedged device with a GPU reset */
 
 	igt_global_reset_lock(i915);
-	mutex_lock(&i915->drm.struct_mutex);
 	wakeref = intel_runtime_pm_get(i915);
 
 	i915_gem_set_wedged(i915);
-	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 
+	mutex_lock(&i915->drm.struct_mutex);
+	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 	i915_reset(i915, ALL_ENGINES, NULL);
+	mutex_unlock(&i915->drm.struct_mutex);
 
 	intel_runtime_pm_put(i915, wakeref);
-	mutex_unlock(&i915->drm.struct_mutex);
 	igt_global_reset_unlock(i915);
 
 	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 8/8] drm/i915: Issue engine resets onto idle engines
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (6 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 7/8] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
@ 2019-01-14 21:04 ` Chris Wilson
  2019-01-14 21:26 ` ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Patchwork
  8 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:04 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Always perform the requested reset, even if we believe the engine is
idle. Presumably there was a reason the caller wanted the reset, and in
the near future we lose the easy tracking for whether the engine is
idle.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c             |  4 ----
 .../gpu/drm/i915/selftests/intel_hangcheck.c  | 22 +++++--------------
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 30f669aa526a..3e0833221c3a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -1063,10 +1063,6 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-	if (i915_seqno_passed(intel_engine_get_seqno(engine),
-			      intel_engine_last_submit(engine)))
-		return 0;
-
 	reset_prepare_engine(engine);
 
 	if (msg)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 28144fd72550..9d0cc9d63a1e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -449,8 +449,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
-			u32 seqno = intel_engine_get_seqno(engine);
-
 			if (active) {
 				struct i915_request *rq;
 
@@ -479,8 +477,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 					break;
 				}
 
-				GEM_BUG_ON(!rq->global_seqno);
-				seqno = rq->global_seqno - 1;
 				i915_request_put(rq);
 			}
 
@@ -496,11 +492,10 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				break;
 			}
 
-			reset_engine_count += active;
 			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
-			    reset_engine_count) {
-				pr_err("%s engine reset %srecorded!\n",
-				       engine->name, active ? "not " : "");
+			    ++reset_engine_count) {
+				pr_err("%s engine reset not recorded!\n",
+				       engine->name);
 				err = -EINVAL;
 				break;
 			}
@@ -728,7 +723,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
-			u32 seqno = intel_engine_get_seqno(engine);
 			struct i915_request *rq = NULL;
 
 			if (flags & TEST_ACTIVE) {
@@ -756,9 +750,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 					err = -EIO;
 					break;
 				}
-
-				GEM_BUG_ON(!rq->global_seqno);
-				seqno = rq->global_seqno - 1;
 			}
 
 			err = i915_reset_engine(engine, NULL);
@@ -795,10 +786,9 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 
 		reported = i915_reset_engine_count(&i915->gpu_error, engine);
 		reported -= threads[engine->id].resets;
-		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
-			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
-			       engine->name, test_name, count, reported,
-			       (flags & TEST_ACTIVE ? count : 0));
+		if (reported != count) {
+			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+			       engine->name, test_name, count, reported);
 			if (!err)
 				err = -EINVAL;
 		}
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-14 21:04 Mika's reward Chris Wilson
                   ` (7 preceding siblings ...)
  2019-01-14 21:04 ` [PATCH 8/8] drm/i915: Issue engine resets onto idle engines Chris Wilson
@ 2019-01-14 21:26 ` Patchwork
  2019-01-14 21:59   ` Chris Wilson
  8 siblings, 1 reply; 20+ messages in thread
From: Patchwork @ 2019-01-14 21:26 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
URL   : https://patchwork.freedesktop.org/series/55200/
State : failure

== Summary ==

Applying: drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
Applying: drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex
Applying: drm/i915: Pull all the reset functionality together into i915_reset.c
error: patch failed: drivers/gpu/drm/i915/selftests/intel_lrc.c:4
error: drivers/gpu/drm/i915/selftests/intel_lrc.c: patch does not apply
error: Did you hand edit your patch?
It does not apply to blobs recorded in its index.
hint: Use 'git am --show-current-patch' to see the failed patch
Using index info to reconstruct a base tree...
Patch failed at 0003 drm/i915: Pull all the reset functionality together into i915_reset.c
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-14 21:26 ` ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Patchwork
@ 2019-01-14 21:59   ` Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-14 21:59 UTC (permalink / raw)
  To: Patchwork; +Cc: intel-gfx

Quoting Patchwork (2019-01-14 21:26:32)
> == Series Details ==
> 
> Series: series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
> URL   : https://patchwork.freedesktop.org/series/55200/
> State : failure
> 
> == Summary ==
> 
> Applying: drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
> Applying: drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex
> Applying: drm/i915: Pull all the reset functionality together into i915_reset.c
> error: patch failed: drivers/gpu/drm/i915/selftests/intel_lrc.c:4
> error: drivers/gpu/drm/i915/selftests/intel_lrc.c: patch does not apply
> error: Did you hand edit your patch?

Something is very fishy here. :|
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-14 21:04 ` [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Chris Wilson
@ 2019-01-15 11:56   ` Mika Kuoppala
  2019-01-15 12:05     ` Chris Wilson
  0 siblings, 1 reply; 20+ messages in thread
From: Mika Kuoppala @ 2019-01-15 11:56 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Make i915_gem_set_wedged() and i915_gem_unset_wedged() behaviour more
> consistently if called concurrently.

More is needed here. The purpose is to make them take turns on the
mutex, instead of racing on the bit? Where is the inconsistency, though?

>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c               | 32 ++++++++++++++-----
>  drivers/gpu/drm/i915/i915_gpu_error.h         |  4 ++-
>  .../gpu/drm/i915/selftests/mock_gem_device.c  |  1 +
>  3 files changed, 28 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 0bfed33178e1..910c49befc50 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3173,10 +3173,15 @@ static void nop_submit_request(struct i915_request *request)
>  
>  void i915_gem_set_wedged(struct drm_i915_private *i915)
>  {
> +	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> -	GEM_TRACE("start\n");
> +	mutex_lock(&error->wedge_mutex);
> +	if (test_bit(I915_WEDGED, &error->flags)) {
> +		mutex_unlock(&error->wedge_mutex);
> +		return;
> +	}
>  
>  	if (GEM_SHOW_DEBUG()) {
>  		struct drm_printer p = drm_debug_printer(__func__);
> @@ -3185,8 +3190,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  			intel_engine_dump(engine, &p, "%s\n", engine->name);
>  	}
>  
> -	if (test_and_set_bit(I915_WEDGED, &i915->gpu_error.flags))
> -		goto out;
> +	GEM_TRACE("start\n");
>  
>  	/*
>  	 * First, stop submission to hw, but do not yet complete requests by
> @@ -3222,23 +3226,31 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  		intel_engine_wakeup(engine);
>  	}
>  
> -out:
> +	smp_mb__before_atomic();

I was thinking about what state you want to guard against, as you
now hold the mutex for wedging. But the answer must be: any other
external state. Make everything visible before flipping the bit.
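
Roughly the ordering I have in mind, as a sketch (illustrative only,
not the exact code in the patch; the reader side here is hypothetical):

        /* writer: publish all the wedging state before the flag */
        engine->submit_request = nop_submit_request;
        /* ... rest of the takeover ... */
        smp_mb__before_atomic();   /* order the stores above before set_bit() */
        set_bit(I915_WEDGED, &error->flags);

        /* reader: pairs with the barrier above */
        if (test_bit(I915_WEDGED, &error->flags)) {
                smp_rmb();         /* observe everything written before the bit */
                /* safe to assume submission has already been nop'ed out */
        }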

-Mika

> +	set_bit(I915_WEDGED, &error->flags);
> +
>  	GEM_TRACE("end\n");
> +	mutex_unlock(&error->wedge_mutex);
>  
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	wake_up_all(&error->reset_queue);
>  }
>  
>  bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  {
> +	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct i915_timeline *tl;
> +	bool ret = false;
>  
>  	lockdep_assert_held(&i915->drm.struct_mutex);
> -	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
> +
> +	if (!test_bit(I915_WEDGED, &error->flags))
>  		return true;
>  
>  	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
>  		return false;
>  
> +	mutex_lock(&error->wedge_mutex);
> +
>  	GEM_TRACE("start\n");
>  
>  	/*
> @@ -3272,7 +3284,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  		 */
>  		if (dma_fence_default_wait(&rq->fence, true,
>  					   MAX_SCHEDULE_TIMEOUT) < 0)
> -			return false;
> +			goto unlock;
>  	}
>  	i915_retire_requests(i915);
>  	GEM_BUG_ON(i915->gt.active_requests);
> @@ -3295,8 +3307,11 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  
>  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> +	ret = true;
> +unlock:
> +	mutex_unlock(&i915->gpu_error.wedge_mutex);
>  
> -	return true;
> +	return ret;
>  }
>  
>  static void
> @@ -5692,6 +5707,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>  			  i915_gem_idle_work_handler);
>  	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>  	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
> +	mutex_init(&dev_priv->gpu_error.wedge_mutex);
>  
>  	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 6d9f45468ac1..604291f7762d 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -271,8 +271,8 @@ struct i915_gpu_error {
>  #define I915_RESET_BACKOFF	0
>  #define I915_RESET_HANDOFF	1
>  #define I915_RESET_MODESET	2
> +#define I915_RESET_ENGINE	3
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
> -#define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
>  
>  	/** Number of times an engine has been reset */
>  	u32 reset_engine_count[I915_NUM_ENGINES];
> @@ -283,6 +283,8 @@ struct i915_gpu_error {
>  	/** Reason for the current *global* reset */
>  	const char *reason;
>  
> +	struct mutex wedge_mutex; /* serialises wedging/unwedging */
> +
>  	/**
>  	 * Waitqueue to signal when a hang is detected. Used to for waiters
>  	 * to release the struct_mutex for the reset to procede.
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 082809569681..3cda66292e76 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -188,6 +188,7 @@ struct drm_i915_private *mock_gem_device(void)
>  
>  	init_waitqueue_head(&i915->gpu_error.wait_queue);
>  	init_waitqueue_head(&i915->gpu_error.reset_queue);
> +	mutex_init(&i915->gpu_error.wedge_mutex);
>  
>  	i915->wq = alloc_ordered_workqueue("mock", 0);
>  	if (!i915->wq)
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-15 11:56   ` Mika Kuoppala
@ 2019-01-15 12:05     ` Chris Wilson
  2019-01-16  9:27       ` Chris Wilson
  0 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-15 12:05 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-01-15 11:56:11)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Make i915_gem_set_wedged() and i915_gem_unset_wedged() behaviour more
> > consistently if called concurrently.
> 
> More is needed in here. The purpose is to make them wait in turns
> on top of mutex, instead of racing on the bit? Where is
> the inconsistency tho.

We report set-wedged multiple times on failure paths. Worse is when we
report set-wedged multiple times simultaneously.
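
With only the bit as the guard, the old flow is roughly (simplified
sketch of the pre-patch code, not a verbatim quote; dump_all_engines()
is an illustrative stand-in for the debug dump):

        /* every concurrent caller reaches this and reports */
        if (GEM_SHOW_DEBUG())
                dump_all_engines(i915);

        /* only now does the loser of the race back off */
        if (test_and_set_bit(I915_WEDGED, &error->flags))
                goto out;
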
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-15 12:05     ` Chris Wilson
@ 2019-01-16  9:27       ` Chris Wilson
  2019-01-16 15:04         ` Mika Kuoppala
  0 siblings, 1 reply; 20+ messages in thread
From: Chris Wilson @ 2019-01-16  9:27 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Chris Wilson (2019-01-15 12:05:27)
> Quoting Mika Kuoppala (2019-01-15 11:56:11)
> > Chris Wilson <chris@chris-wilson.co.uk> writes:
> > 
> > > Make i915_gem_set_wedged() and i915_gem_unset_wedged() behaviour more
> > > consistently if called concurrently.
> > 
> > More is needed in here. The purpose is to make them wait in turns
> > on top of mutex, instead of racing on the bit? Where is
> > the inconsistency tho.
> 
> We report set-wedged multiple times on failure paths. Worse is when we
> report set-wedged multiple times simultaneously.

I've been contemplating just moving the reporting inside the test-bit
serialisation, but I kept resisting. This issue has been nagging at me
ever since using the bit for loose serialisation; you either fix a race
or live to regret it.
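
i.e. what the patch settles on, roughly (a sketch of the shape, not
the full function):

        mutex_lock(&error->wedge_mutex);
        if (test_bit(I915_WEDGED, &error->flags)) {
                mutex_unlock(&error->wedge_mutex);
                return; /* someone already wedged and reported */
        }

        /* dump/report exactly once, under the mutex */
        ...

        set_bit(I915_WEDGED, &error->flags);
        mutex_unlock(&error->wedge_mutex);
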
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged()
  2019-01-16  9:27       ` Chris Wilson
@ 2019-01-16 15:04         ` Mika Kuoppala
  0 siblings, 0 replies; 20+ messages in thread
From: Mika Kuoppala @ 2019-01-16 15:04 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Chris Wilson (2019-01-15 12:05:27)
>> Quoting Mika Kuoppala (2019-01-15 11:56:11)
>> > Chris Wilson <chris@chris-wilson.co.uk> writes:
>> > 
>> > > Make i915_gem_set_wedged() and i915_gem_unset_wedged() behaviour more
>> > > consistently if called concurrently.
>> > 
>> > More is needed in here. The purpose is to make them wait in turns
>> > on top of mutex, instead of racing on the bit? Where is
>> > the inconsistency tho.
>> 
>> We report set-wedged multiple times on failure paths. Worse is when we
>> report set-wedged multiple times simultaneously.
>
> I've been contemplating just moving the reporting inside the test-bit
> serialisation, but I kept resisting. This issue has been nagging at me
> ever since using the bit for loose serialisation; you either fix a race
> or live to regret it.

That part I can agree with. Now it is all contained, so if it coalesces
on reports, it will be alarming :)

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c
  2019-01-14 21:04 ` [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
@ 2019-01-16 15:06   ` Mika Kuoppala
  2019-01-16 15:31     ` Chris Wilson
  0 siblings, 1 reply; 20+ messages in thread
From: Mika Kuoppala @ 2019-01-16 15:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Currently the code to reset the GPU and our state is spread widely
> across a few files. Pull the logic together into a common file.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Dunno how it goes, but my gut feeling is that this would have
been better at the end of the series, after the dust has settled.
Regardless,

Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/Makefile                 |    3 +-
>  drivers/gpu/drm/i915/i915_debugfs.c           |    2 +
>  drivers/gpu/drm/i915/i915_drv.c               |  206 +--
>  drivers/gpu/drm/i915/i915_drv.h               |   33 +-
>  drivers/gpu/drm/i915/i915_gem.c               |  446 +-----
>  drivers/gpu/drm/i915/i915_gem_gtt.c           |    1 +
>  drivers/gpu/drm/i915/i915_irq.c               |  238 ---
>  drivers/gpu/drm/i915/i915_request.c           |    1 +
>  drivers/gpu/drm/i915/i915_reset.c             | 1389 +++++++++++++++++
>  drivers/gpu/drm/i915/i915_reset.h             |   56 +
>  drivers/gpu/drm/i915/intel_display.c          |   15 +-
>  drivers/gpu/drm/i915/intel_engine_cs.c        |    1 +
>  drivers/gpu/drm/i915/intel_guc.h              |    3 +
>  drivers/gpu/drm/i915/intel_hangcheck.c        |    1 +
>  drivers/gpu/drm/i915/intel_uc.c               |    1 +
>  drivers/gpu/drm/i915/intel_uncore.c           |  556 -------
>  drivers/gpu/drm/i915/selftests/intel_lrc.c    |    2 +
>  .../drm/i915/selftests/intel_workarounds.c    |    1 +
>  18 files changed, 1483 insertions(+), 1472 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_reset.c
>  create mode 100644 drivers/gpu/drm/i915/i915_reset.h
>
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index c34bee16730d..611115ed00db 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -40,9 +40,10 @@ i915-y := i915_drv.o \
>  	  i915_mm.o \
>  	  i915_params.o \
>  	  i915_pci.o \
> +	  i915_reset.o \
>  	  i915_suspend.o \
> -	  i915_syncmap.o \
>  	  i915_sw_fence.o \
> +	  i915_syncmap.o \
>  	  i915_sysfs.o \
>  	  intel_csr.o \
>  	  intel_device_info.o \
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index da6d2581cb0e..a93abb2274e6 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -32,6 +32,8 @@
>  #include "intel_drv.h"
>  #include "intel_guc_submission.h"
>  
> +#include "i915_reset.h"
> +
>  static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
>  {
>  	return to_i915(node->minor->dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index dafbbfadd1ad..f462a4d28af4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -48,6 +48,7 @@
>  #include "i915_drv.h"
>  #include "i915_trace.h"
>  #include "i915_pmu.h"
> +#include "i915_reset.h"
>  #include "i915_query.h"
>  #include "i915_vgpu.h"
>  #include "intel_drv.h"
> @@ -2205,211 +2206,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
>  	return i915_drm_resume(dev);
>  }
>  
> -/**
> - * i915_reset - reset chip after a hang
> - * @i915: #drm_i915_private to reset
> - * @stalled_mask: mask of the stalled engines with the guilty requests
> - * @reason: user error message for why we are resetting
> - *
> - * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
> - * on failure.
> - *
> - * Caller must hold the struct_mutex.
> - *
> - * Procedure is fairly simple:
> - *   - reset the chip using the reset reg
> - *   - re-init context state
> - *   - re-init hardware status page
> - *   - re-init ring buffer
> - *   - re-init interrupt state
> - *   - re-init display
> - */
> -void i915_reset(struct drm_i915_private *i915,
> -		unsigned int stalled_mask,
> -		const char *reason)
> -{
> -	struct i915_gpu_error *error = &i915->gpu_error;
> -	int ret;
> -	int i;
> -
> -	GEM_TRACE("flags=%lx\n", error->flags);
> -
> -	might_sleep();
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -	assert_rpm_wakelock_held(i915);
> -	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> -
> -	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
> -		return;
> -
> -	/* Clear any previous failed attempts at recovery. Time to try again. */
> -	if (!i915_gem_unset_wedged(i915))
> -		goto wakeup;
> -
> -	if (reason)
> -		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> -	error->reset_count++;
> -
> -	ret = i915_gem_reset_prepare(i915);
> -	if (ret) {
> -		dev_err(i915->drm.dev, "GPU recovery failed\n");
> -		goto taint;
> -	}
> -
> -	if (!intel_has_gpu_reset(i915)) {
> -		if (i915_modparams.reset)
> -			dev_err(i915->drm.dev, "GPU reset not supported\n");
> -		else
> -			DRM_DEBUG_DRIVER("GPU reset disabled\n");
> -		goto error;
> -	}
> -
> -	for (i = 0; i < 3; i++) {
> -		ret = intel_gpu_reset(i915, ALL_ENGINES);
> -		if (ret == 0)
> -			break;
> -
> -		msleep(100);
> -	}
> -	if (ret) {
> -		dev_err(i915->drm.dev, "Failed to reset chip\n");
> -		goto taint;
> -	}
> -
> -	/* Ok, now get things going again... */
> -
> -	/*
> -	 * Everything depends on having the GTT running, so we need to start
> -	 * there.
> -	 */
> -	ret = i915_ggtt_enable_hw(i915);
> -	if (ret) {
> -		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
> -			  ret);
> -		goto error;
> -	}
> -
> -	i915_gem_reset(i915, stalled_mask);
> -	intel_overlay_reset(i915);
> -
> -	/*
> -	 * Next we need to restore the context, but we don't use those
> -	 * yet either...
> -	 *
> -	 * Ring buffer needs to be re-initialized in the KMS case, or if X
> -	 * was running at the time of the reset (i.e. we weren't VT
> -	 * switched away).
> -	 */
> -	ret = i915_gem_init_hw(i915);
> -	if (ret) {
> -		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
> -			  ret);
> -		goto error;
> -	}
> -
> -	i915_queue_hangcheck(i915);
> -
> -finish:
> -	i915_gem_reset_finish(i915);
> -wakeup:
> -	clear_bit(I915_RESET_HANDOFF, &error->flags);
> -	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
> -	return;
> -
> -taint:
> -	/*
> -	 * History tells us that if we cannot reset the GPU now, we
> -	 * never will. This then impacts everything that is run
> -	 * subsequently. On failing the reset, we mark the driver
> -	 * as wedged, preventing further execution on the GPU.
> -	 * We also want to go one step further and add a taint to the
> -	 * kernel so that any subsequent faults can be traced back to
> -	 * this failure. This is important for CI, where if the
> -	 * GPU/driver fails we would like to reboot and restart testing
> -	 * rather than continue on into oblivion. For everyone else,
> -	 * the system should still plod along, but they have been warned!
> -	 */
> -	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> -error:
> -	i915_gem_set_wedged(i915);
> -	i915_retire_requests(i915);
> -	goto finish;
> -}
> -
> -static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
> -					struct intel_engine_cs *engine)
> -{
> -	return intel_gpu_reset(dev_priv, intel_engine_flag(engine));
> -}
> -
> -/**
> - * i915_reset_engine - reset GPU engine to recover from a hang
> - * @engine: engine to reset
> - * @msg: reason for GPU reset; or NULL for no dev_notice()
> - *
> - * Reset a specific GPU engine. Useful if a hang is detected.
> - * Returns zero on successful reset or otherwise an error code.
> - *
> - * Procedure is:
> - *  - identifies the request that caused the hang and it is dropped
> - *  - reset engine (which will force the engine to idle)
> - *  - re-init/configure engine
> - */
> -int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> -{
> -	struct i915_gpu_error *error = &engine->i915->gpu_error;
> -	struct i915_request *active_request;
> -	int ret;
> -
> -	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
> -	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
> -
> -	active_request = i915_gem_reset_prepare_engine(engine);
> -	if (IS_ERR_OR_NULL(active_request)) {
> -		/* Either the previous reset failed, or we pardon the reset. */
> -		ret = PTR_ERR(active_request);
> -		goto out;
> -	}
> -
> -	if (msg)
> -		dev_notice(engine->i915->drm.dev,
> -			   "Resetting %s for %s\n", engine->name, msg);
> -	error->reset_engine_count[engine->id]++;
> -
> -	if (!engine->i915->guc.execbuf_client)
> -		ret = intel_gt_reset_engine(engine->i915, engine);
> -	else
> -		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
> -	if (ret) {
> -		/* If we fail here, we expect to fallback to a global reset */
> -		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
> -				 engine->i915->guc.execbuf_client ? "GuC " : "",
> -				 engine->name, ret);
> -		goto out;
> -	}
> -
> -	/*
> -	 * The request that caused the hang is stuck on elsp, we know the
> -	 * active request and can drop it, adjust head to skip the offending
> -	 * request to resume executing remaining requests in the queue.
> -	 */
> -	i915_gem_reset_engine(engine, active_request, true);
> -
> -	/*
> -	 * The engine and its registers (and workarounds in case of render)
> -	 * have been reset to their default values. Follow the init_ring
> -	 * process to program RING_MODE, HWSP and re-enable submission.
> -	 */
> -	ret = engine->init_hw(engine);
> -	if (ret)
> -		goto out;
> -
> -out:
> -	intel_engine_cancel_stop_cs(engine);
> -	i915_gem_reset_finish_engine(engine);
> -	return ret;
> -}
> -
>  static int i915_pm_prepare(struct device *kdev)
>  {
>  	struct pci_dev *pdev = to_pci_dev(kdev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index fa99824f63b3..224d433ac7b6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2611,19 +2611,7 @@ extern const struct dev_pm_ops i915_pm_ops;
>  extern int i915_driver_load(struct pci_dev *pdev,
>  			    const struct pci_device_id *ent);
>  extern void i915_driver_unload(struct drm_device *dev);
> -extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> -extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
> -
> -extern void i915_reset(struct drm_i915_private *i915,
> -		       unsigned int stalled_mask,
> -		       const char *reason);
> -extern int i915_reset_engine(struct intel_engine_cs *engine,
> -			     const char *reason);
> -
> -extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
> -extern int intel_reset_guc(struct drm_i915_private *dev_priv);
> -extern int intel_guc_reset_engine(struct intel_guc *guc,
> -				  struct intel_engine_cs *engine);
> +
>  extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
>  extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
>  extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
> @@ -2666,20 +2654,11 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
>  			   &dev_priv->gpu_error.hangcheck_work, delay);
>  }
>  
> -__printf(4, 5)
> -void i915_handle_error(struct drm_i915_private *dev_priv,
> -		       u32 engine_mask,
> -		       unsigned long flags,
> -		       const char *fmt, ...);
> -#define I915_ERROR_CAPTURE BIT(0)
> -
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
>  extern void intel_irq_fini(struct drm_i915_private *dev_priv);
>  int intel_irq_install(struct drm_i915_private *dev_priv);
>  void intel_irq_uninstall(struct drm_i915_private *dev_priv);
>  
> -void i915_clear_error_registers(struct drm_i915_private *dev_priv);
> -
>  static inline bool intel_gvt_active(struct drm_i915_private *dev_priv)
>  {
>  	return dev_priv->gvt;
> @@ -3044,18 +3023,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>  	return READ_ONCE(error->reset_engine_count[engine->id]);
>  }
>  
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
> -void i915_gem_reset(struct drm_i915_private *dev_priv,
> -		    unsigned int stalled_mask);
> -void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>  bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
> -void i915_gem_reset_engine(struct intel_engine_cs *engine,
> -			   struct i915_request *request,
> -			   bool stalled);
>  
>  void i915_gem_init_mmio(struct drm_i915_private *i915);
>  int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 910c49befc50..a7e0d61a45ea 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -27,15 +27,6 @@
>  
>  #include <drm/drm_vma_manager.h>
>  #include <drm/i915_drm.h>
> -#include "i915_drv.h"
> -#include "i915_gem_clflush.h"
> -#include "i915_vgpu.h"
> -#include "i915_trace.h"
> -#include "intel_drv.h"
> -#include "intel_frontbuffer.h"
> -#include "intel_mocs.h"
> -#include "intel_workarounds.h"
> -#include "i915_gemfs.h"
>  #include <linux/dma-fence-array.h>
>  #include <linux/kthread.h>
>  #include <linux/reservation.h>
> @@ -46,6 +37,18 @@
>  #include <linux/pci.h>
>  #include <linux/dma-buf.h>
>  
> +#include "i915_drv.h"
> +#include "i915_gem_clflush.h"
> +#include "i915_gemfs.h"
> +#include "i915_reset.h"
> +#include "i915_trace.h"
> +#include "i915_vgpu.h"
> +
> +#include "intel_drv.h"
> +#include "intel_frontbuffer.h"
> +#include "intel_mocs.h"
> +#include "intel_workarounds.h"
> +
>  static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>  
>  static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> @@ -2859,61 +2862,6 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>  	return 0;
>  }
>  
> -static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
> -					const struct i915_gem_context *ctx)
> -{
> -	unsigned int score;
> -	unsigned long prev_hang;
> -
> -	if (i915_gem_context_is_banned(ctx))
> -		score = I915_CLIENT_SCORE_CONTEXT_BAN;
> -	else
> -		score = 0;
> -
> -	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
> -	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
> -		score += I915_CLIENT_SCORE_HANG_FAST;
> -
> -	if (score) {
> -		atomic_add(score, &file_priv->ban_score);
> -
> -		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
> -				 ctx->name, score,
> -				 atomic_read(&file_priv->ban_score));
> -	}
> -}
> -
> -static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
> -{
> -	unsigned int score;
> -	bool banned, bannable;
> -
> -	atomic_inc(&ctx->guilty_count);
> -
> -	bannable = i915_gem_context_is_bannable(ctx);
> -	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> -
> -	/* Cool contexts don't accumulate client ban score */
> -	if (!bannable)
> -		return;
> -
> -	if (banned) {
> -		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> -				 ctx->name, atomic_read(&ctx->guilty_count),
> -				 score);
> -		i915_gem_context_set_banned(ctx);
> -	}
> -
> -	if (!IS_ERR_OR_NULL(ctx->file_priv))
> -		i915_gem_client_mark_guilty(ctx->file_priv, ctx);
> -}
> -
> -static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
> -{
> -	atomic_inc(&ctx->active_count);
> -}
> -
>  struct i915_request *
>  i915_gem_find_active_request(struct intel_engine_cs *engine)
>  {
> @@ -2944,376 +2892,6 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
>  	return active;
>  }
>  
> -/*
> - * Ensure irq handler finishes, and not run again.
> - * Also return the active request so that we only search for it once.
> - */
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> -{
> -	struct i915_request *request;
> -
> -	/*
> -	 * During the reset sequence, we must prevent the engine from
> -	 * entering RC6. As the context state is undefined until we restart
> -	 * the engine, if it does enter RC6 during the reset, the state
> -	 * written to the powercontext is undefined and so we may lose
> -	 * GPU state upon resume, i.e. fail to restart after a reset.
> -	 */
> -	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
> -
> -	request = engine->reset.prepare(engine);
> -	if (request && request->fence.error == -EIO)
> -		request = ERR_PTR(-EIO); /* Previous reset failed! */
> -
> -	return request;
> -}
> -
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
> -{
> -	struct intel_engine_cs *engine;
> -	struct i915_request *request;
> -	enum intel_engine_id id;
> -	int err = 0;
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		request = i915_gem_reset_prepare_engine(engine);
> -		if (IS_ERR(request)) {
> -			err = PTR_ERR(request);
> -			continue;
> -		}
> -
> -		engine->hangcheck.active_request = request;
> -	}
> -
> -	i915_gem_revoke_fences(dev_priv);
> -	intel_uc_sanitize(dev_priv);
> -
> -	return err;
> -}
> -
> -static void engine_skip_context(struct i915_request *request)
> -{
> -	struct intel_engine_cs *engine = request->engine;
> -	struct i915_gem_context *hung_ctx = request->gem_context;
> -	struct i915_timeline *timeline = request->timeline;
> -	unsigned long flags;
> -
> -	GEM_BUG_ON(timeline == &engine->timeline);
> -
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> -	spin_lock(&timeline->lock);
> -
> -	list_for_each_entry_continue(request, &engine->timeline.requests, link)
> -		if (request->gem_context == hung_ctx)
> -			i915_request_skip(request, -EIO);
> -
> -	list_for_each_entry(request, &timeline->requests, link)
> -		i915_request_skip(request, -EIO);
> -
> -	spin_unlock(&timeline->lock);
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -}
> -
> -/* Returns the request if it was guilty of the hang */
> -static struct i915_request *
> -i915_gem_reset_request(struct intel_engine_cs *engine,
> -		       struct i915_request *request,
> -		       bool stalled)
> -{
> -	/* The guilty request will get skipped on a hung engine.
> -	 *
> -	 * Users of client default contexts do not rely on logical
> -	 * state preserved between batches so it is safe to execute
> -	 * queued requests following the hang. Non default contexts
> -	 * rely on preserved state, so skipping a batch loses the
> -	 * evolution of the state and it needs to be considered corrupted.
> -	 * Executing more queued batches on top of corrupted state is
> -	 * risky. But we take the risk by trying to advance through
> -	 * the queued requests in order to make the client behaviour
> -	 * more predictable around resets, by not throwing away random
> -	 * amount of batches it has prepared for execution. Sophisticated
> -	 * clients can use gem_reset_stats_ioctl and dma fence status
> -	 * (exported via sync_file info ioctl on explicit fences) to observe
> -	 * when it loses the context state and should rebuild accordingly.
> -	 *
> -	 * The context ban, and ultimately the client ban, mechanism are safety
> -	 * valves if client submission ends up resulting in nothing more than
> -	 * subsequent hangs.
> -	 */
> -
> -	if (i915_request_completed(request)) {
> -		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
> -			  engine->name, request->global_seqno,
> -			  request->fence.context, request->fence.seqno,
> -			  intel_engine_get_seqno(engine));
> -		stalled = false;
> -	}
> -
> -	if (stalled) {
> -		i915_gem_context_mark_guilty(request->gem_context);
> -		i915_request_skip(request, -EIO);
> -
> -		/* If this context is now banned, skip all pending requests. */
> -		if (i915_gem_context_is_banned(request->gem_context))
> -			engine_skip_context(request);
> -	} else {
> -		/*
> -		 * Since this is not the hung engine, it may have advanced
> -		 * since the hang declaration. Double check by refinding
> -		 * the active request at the time of the reset.
> -		 */
> -		request = i915_gem_find_active_request(engine);
> -		if (request) {
> -			unsigned long flags;
> -
> -			i915_gem_context_mark_innocent(request->gem_context);
> -			dma_fence_set_error(&request->fence, -EAGAIN);
> -
> -			/* Rewind the engine to replay the incomplete rq */
> -			spin_lock_irqsave(&engine->timeline.lock, flags);
> -			request = list_prev_entry(request, link);
> -			if (&request->link == &engine->timeline.requests)
> -				request = NULL;
> -			spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -		}
> -	}
> -
> -	return request;
> -}
> -
> -void i915_gem_reset_engine(struct intel_engine_cs *engine,
> -			   struct i915_request *request,
> -			   bool stalled)
> -{
> -	if (request)
> -		request = i915_gem_reset_request(engine, request, stalled);
> -
> -	/* Setup the CS to resume from the breadcrumb of the hung request */
> -	engine->reset.reset(engine, request);
> -}
> -
> -void i915_gem_reset(struct drm_i915_private *dev_priv,
> -		    unsigned int stalled_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -
> -	i915_retire_requests(dev_priv);
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		struct intel_context *ce;
> -
> -		i915_gem_reset_engine(engine,
> -				      engine->hangcheck.active_request,
> -				      stalled_mask & ENGINE_MASK(id));
> -		ce = fetch_and_zero(&engine->last_retired_context);
> -		if (ce)
> -			intel_context_unpin(ce);
> -
> -		/*
> -		 * Ostensibily, we always want a context loaded for powersaving,
> -		 * so if the engine is idle after the reset, send a request
> -		 * to load our scratch kernel_context.
> -		 *
> -		 * More mysteriously, if we leave the engine idle after a reset,
> -		 * the next userspace batch may hang, with what appears to be
> -		 * an incoherent read by the CS (presumably stale TLB). An
> -		 * empty request appears sufficient to paper over the glitch.
> -		 */
> -		if (intel_engine_is_idle(engine)) {
> -			struct i915_request *rq;
> -
> -			rq = i915_request_alloc(engine,
> -						dev_priv->kernel_context);
> -			if (!IS_ERR(rq))
> -				i915_request_add(rq);
> -		}
> -	}
> -
> -	i915_gem_restore_fences(dev_priv);
> -}
> -
> -void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> -{
> -	engine->reset.finish(engine);
> -
> -	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> -}
> -
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		engine->hangcheck.active_request = NULL;
> -		i915_gem_reset_finish_engine(engine);
> -	}
> -}
> -
> -static void nop_submit_request(struct i915_request *request)
> -{
> -	unsigned long flags;
> -
> -	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
> -		  request->engine->name,
> -		  request->fence.context, request->fence.seqno);
> -	dma_fence_set_error(&request->fence, -EIO);
> -
> -	spin_lock_irqsave(&request->engine->timeline.lock, flags);
> -	__i915_request_submit(request);
> -	intel_engine_write_global_seqno(request->engine, request->global_seqno);
> -	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
> -}
> -
> -void i915_gem_set_wedged(struct drm_i915_private *i915)
> -{
> -	struct i915_gpu_error *error = &i915->gpu_error;
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	mutex_lock(&error->wedge_mutex);
> -	if (test_bit(I915_WEDGED, &error->flags)) {
> -		mutex_unlock(&error->wedge_mutex);
> -		return;
> -	}
> -
> -	if (GEM_SHOW_DEBUG()) {
> -		struct drm_printer p = drm_debug_printer(__func__);
> -
> -		for_each_engine(engine, i915, id)
> -			intel_engine_dump(engine, &p, "%s\n", engine->name);
> -	}
> -
> -	GEM_TRACE("start\n");
> -
> -	/*
> -	 * First, stop submission to hw, but do not yet complete requests by
> -	 * rolling the global seqno forward (since this would complete requests
> -	 * for which we haven't set the fence error to EIO yet).
> -	 */
> -	for_each_engine(engine, i915, id)
> -		i915_gem_reset_prepare_engine(engine);
> -
> -	/* Even if the GPU reset fails, it should still stop the engines */
> -	if (INTEL_GEN(i915) >= 5)
> -		intel_gpu_reset(i915, ALL_ENGINES);
> -
> -	for_each_engine(engine, i915, id) {
> -		engine->submit_request = nop_submit_request;
> -		engine->schedule = NULL;
> -	}
> -	i915->caps.scheduler = 0;
> -
> -	/*
> -	 * Make sure no request can slip through without getting completed by
> -	 * either this call here to intel_engine_write_global_seqno, or the one
> -	 * in nop_submit_request.
> -	 */
> -	synchronize_rcu();
> -
> -	/* Mark all executing requests as skipped */
> -	for_each_engine(engine, i915, id)
> -		engine->cancel_requests(engine);
> -
> -	for_each_engine(engine, i915, id) {
> -		i915_gem_reset_finish_engine(engine);
> -		intel_engine_wakeup(engine);
> -	}
> -
> -	smp_mb__before_atomic();
> -	set_bit(I915_WEDGED, &error->flags);
> -
> -	GEM_TRACE("end\n");
> -	mutex_unlock(&error->wedge_mutex);
> -
> -	wake_up_all(&error->reset_queue);
> -}
> -
> -bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> -{
> -	struct i915_gpu_error *error = &i915->gpu_error;
> -	struct i915_timeline *tl;
> -	bool ret = false;
> -
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -
> -	if (!test_bit(I915_WEDGED, &error->flags))
> -		return true;
> -
> -	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
> -		return false;
> -
> -	mutex_lock(&error->wedge_mutex);
> -
> -	GEM_TRACE("start\n");
> -
> -	/*
> -	 * Before unwedging, make sure that all pending operations
> -	 * are flushed and errored out - we may have requests waiting upon
> -	 * third party fences. We marked all inflight requests as EIO, and
> -	 * every execbuf since returned EIO, for consistency we want all
> -	 * the currently pending requests to also be marked as EIO, which
> -	 * is done inside our nop_submit_request - and so we must wait.
> -	 *
> -	 * No more can be submitted until we reset the wedged bit.
> -	 */
> -	list_for_each_entry(tl, &i915->gt.timelines, link) {
> -		struct i915_request *rq;
> -
> -		rq = i915_gem_active_peek(&tl->last_request,
> -					  &i915->drm.struct_mutex);
> -		if (!rq)
> -			continue;
> -
> -		/*
> -		 * We can't use our normal waiter as we want to
> -		 * avoid recursively trying to handle the current
> -		 * reset. The basic dma_fence_default_wait() installs
> -		 * a callback for dma_fence_signal(), which is
> -		 * triggered by our nop handler (indirectly, the
> -		 * callback enables the signaler thread which is
> -		 * woken by the nop_submit_request() advancing the seqno
> -		 * and when the seqno passes the fence, the signaler
> -		 * then signals the fence waking us up).
> -		 */
> -		if (dma_fence_default_wait(&rq->fence, true,
> -					   MAX_SCHEDULE_TIMEOUT) < 0)
> -			goto unlock;
> -	}
> -	i915_retire_requests(i915);
> -	GEM_BUG_ON(i915->gt.active_requests);
> -
> -	intel_engines_sanitize(i915, false);
> -
> -	/*
> -	 * Undo nop_submit_request. We prevent all new i915 requests from
> -	 * being queued (by disallowing execbuf whilst wedged) so having
> -	 * waited for all active requests above, we know the system is idle
> -	 * and do not have to worry about a thread being inside
> -	 * engine->submit_request() as we swap over. So unlike installing
> -	 * the nop_submit_request on reset, we can do this from normal
> -	 * context and do not require stop_machine().
> -	 */
> -	intel_engines_reset_default_submission(i915);
> -	i915_gem_contexts_lost(i915);
> -
> -	GEM_TRACE("end\n");
> -
> -	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> -	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> -	ret = true;
> -unlock:
> -	mutex_unlock(&i915->gpu_error.wedge_mutex);
> -
> -	return ret;
> -}
> -
>  static void
>  i915_gem_retire_work_handler(struct work_struct *work)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 74e6d02dcbbf..68e02e46186b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -37,6 +37,7 @@
>  
>  #include "i915_drv.h"
>  #include "i915_vgpu.h"
> +#include "i915_reset.h"
>  #include "i915_trace.h"
>  #include "intel_drv.h"
>  #include "intel_frontbuffer.h"
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 94187e68d39a..1c6cf024a509 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2930,46 +2930,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>  	return IRQ_HANDLED;
>  }
>  
> -struct wedge_me {
> -	struct delayed_work work;
> -	struct drm_i915_private *i915;
> -	const char *name;
> -};
> -
> -static void wedge_me(struct work_struct *work)
> -{
> -	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> -
> -	dev_err(w->i915->drm.dev,
> -		"%s timed out, cancelling all in-flight rendering.\n",
> -		w->name);
> -	i915_gem_set_wedged(w->i915);
> -}
> -
> -static void __init_wedge(struct wedge_me *w,
> -			 struct drm_i915_private *i915,
> -			 long timeout,
> -			 const char *name)
> -{
> -	w->i915 = i915;
> -	w->name = name;
> -
> -	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> -	schedule_delayed_work(&w->work, timeout);
> -}
> -
> -static void __fini_wedge(struct wedge_me *w)
> -{
> -	cancel_delayed_work_sync(&w->work);
> -	destroy_delayed_work_on_stack(&w->work);
> -	w->i915 = NULL;
> -}
> -
> -#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
> -	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
> -	     (W)->i915;							\
> -	     __fini_wedge((W)))
> -
>  static u32
>  gen11_gt_engine_identity(struct drm_i915_private * const i915,
>  			 const unsigned int bank, const unsigned int bit)
> @@ -3180,204 +3140,6 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
>  	return IRQ_HANDLED;
>  }
>  
> -static void i915_reset_device(struct drm_i915_private *dev_priv,
> -			      u32 engine_mask,
> -			      const char *reason)
> -{
> -	struct i915_gpu_error *error = &dev_priv->gpu_error;
> -	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
> -	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> -	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> -	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> -	struct wedge_me w;
> -
> -	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> -
> -	DRM_DEBUG_DRIVER("resetting chip\n");
> -	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> -
> -	/* Use a watchdog to ensure that our reset completes */
> -	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
> -		intel_prepare_reset(dev_priv);
> -
> -		error->reason = reason;
> -		error->stalled_mask = engine_mask;
> -
> -		/* Signal that locked waiters should reset the GPU */
> -		smp_mb__before_atomic();
> -		set_bit(I915_RESET_HANDOFF, &error->flags);
> -		wake_up_all(&error->wait_queue);
> -
> -		/* Wait for anyone holding the lock to wakeup, without
> -		 * blocking indefinitely on struct_mutex.
> -		 */
> -		do {
> -			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> -				i915_reset(dev_priv, engine_mask, reason);
> -				mutex_unlock(&dev_priv->drm.struct_mutex);
> -			}
> -		} while (wait_on_bit_timeout(&error->flags,
> -					     I915_RESET_HANDOFF,
> -					     TASK_UNINTERRUPTIBLE,
> -					     1));
> -
> -		error->stalled_mask = 0;
> -		error->reason = NULL;
> -
> -		intel_finish_reset(dev_priv);
> -	}
> -
> -	if (!test_bit(I915_WEDGED, &error->flags))
> -		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
> -}
> -
> -void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> -{
> -	u32 eir;
> -
> -	if (!IS_GEN(dev_priv, 2))
> -		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
> -
> -	if (INTEL_GEN(dev_priv) < 4)
> -		I915_WRITE(IPEIR, I915_READ(IPEIR));
> -	else
> -		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
> -
> -	I915_WRITE(EIR, I915_READ(EIR));
> -	eir = I915_READ(EIR);
> -	if (eir) {
> -		/*
> -		 * some errors might have become stuck,
> -		 * mask them.
> -		 */
> -		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
> -		I915_WRITE(EMR, I915_READ(EMR) | eir);
> -		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
> -	}
> -
> -	if (INTEL_GEN(dev_priv) >= 8) {
> -		I915_WRITE(GEN8_RING_FAULT_REG,
> -			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
> -		POSTING_READ(GEN8_RING_FAULT_REG);
> -	} else if (INTEL_GEN(dev_priv) >= 6) {
> -		struct intel_engine_cs *engine;
> -		enum intel_engine_id id;
> -
> -		for_each_engine(engine, dev_priv, id) {
> -			I915_WRITE(RING_FAULT_REG(engine),
> -				   I915_READ(RING_FAULT_REG(engine)) &
> -				   ~RING_FAULT_VALID);
> -		}
> -		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
> -	}
> -}
> -
> -/**
> - * i915_handle_error - handle a gpu error
> - * @dev_priv: i915 device private
> - * @engine_mask: mask representing engines that are hung
> - * @flags: control flags
> - * @fmt: Error message format string
> - *
> - * Do some basic checking of register state at error time and
> - * dump it to the syslog.  Also call i915_capture_error_state() to make
> - * sure we get a record and make it available in debugfs.  Fire a uevent
> - * so userspace knows something bad happened (should trigger collection
> - * of a ring dump etc.).
> - */
> -void i915_handle_error(struct drm_i915_private *dev_priv,
> -		       u32 engine_mask,
> -		       unsigned long flags,
> -		       const char *fmt, ...)
> -{
> -	struct intel_engine_cs *engine;
> -	intel_wakeref_t wakeref;
> -	unsigned int tmp;
> -	char error_msg[80];
> -	char *msg = NULL;
> -
> -	if (fmt) {
> -		va_list args;
> -
> -		va_start(args, fmt);
> -		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> -		va_end(args);
> -
> -		msg = error_msg;
> -	}
> -
> -	/*
> -	 * In most cases it's guaranteed that we get here with an RPM
> -	 * reference held, for example because there is a pending GPU
> -	 * request that won't finish until the reset is done. This
> -	 * isn't the case at least when we get here by doing a
> -	 * simulated reset via debugfs, so get an RPM reference.
> -	 */
> -	wakeref = intel_runtime_pm_get(dev_priv);
> -
> -	engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
> -
> -	if (flags & I915_ERROR_CAPTURE) {
> -		i915_capture_error_state(dev_priv, engine_mask, msg);
> -		i915_clear_error_registers(dev_priv);
> -	}
> -
> -	/*
> -	 * Try engine reset when available. We fall back to full reset if
> -	 * single reset fails.
> -	 */
> -	if (intel_has_reset_engine(dev_priv) &&
> -	    !i915_terminally_wedged(&dev_priv->gpu_error)) {
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
> -			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					     &dev_priv->gpu_error.flags))
> -				continue;
> -
> -			if (i915_reset_engine(engine, msg) == 0)
> -				engine_mask &= ~intel_engine_flag(engine);
> -
> -			clear_bit(I915_RESET_ENGINE + engine->id,
> -				  &dev_priv->gpu_error.flags);
> -			wake_up_bit(&dev_priv->gpu_error.flags,
> -				    I915_RESET_ENGINE + engine->id);
> -		}
> -	}
> -
> -	if (!engine_mask)
> -		goto out;
> -
> -	/* Full reset needs the mutex, stop any other user trying to do so. */
> -	if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
> -		wait_event(dev_priv->gpu_error.reset_queue,
> -			   !test_bit(I915_RESET_BACKOFF,
> -				     &dev_priv->gpu_error.flags));
> -		goto out;
> -	}
> -
> -	/* Prevent any other reset-engine attempt. */
> -	for_each_engine(engine, dev_priv, tmp) {
> -		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					&dev_priv->gpu_error.flags))
> -			wait_on_bit(&dev_priv->gpu_error.flags,
> -				    I915_RESET_ENGINE + engine->id,
> -				    TASK_UNINTERRUPTIBLE);
> -	}
> -
> -	i915_reset_device(dev_priv, engine_mask, msg);
> -
> -	for_each_engine(engine, dev_priv, tmp) {
> -		clear_bit(I915_RESET_ENGINE + engine->id,
> -			  &dev_priv->gpu_error.flags);
> -	}
> -
> -	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
> -	wake_up_all(&dev_priv->gpu_error.reset_queue);
> -
> -out:
> -	intel_runtime_pm_put(dev_priv, wakeref);
> -}
> -
>  /* Called from drm generic code, passed 'crtc' which
>   * we use as a pipe index
>   */
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index d1355154886a..5403d4e2cee0 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched/signal.h>
>  
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static const char *i915_fence_get_driver_name(struct dma_fence *fence)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> new file mode 100644
> index 000000000000..e2e40b44a9a8
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -0,0 +1,1389 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2008-2018 Intel Corporation
> + */
> +
> +#include <linux/sched/mm.h>
> +
> +#include "i915_drv.h"
> +#include "i915_gpu_error.h"
> +#include "i915_reset.h"
> +
> +#include "intel_guc.h"
> +
> +static void engine_skip_context(struct i915_request *rq)
> +{
> +	struct intel_engine_cs *engine = rq->engine;
> +	struct i915_gem_context *hung_ctx = rq->gem_context;
> +	struct i915_timeline *timeline = rq->timeline;
> +	unsigned long flags;
> +
> +	GEM_BUG_ON(timeline == &engine->timeline);
> +
> +	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	spin_lock(&timeline->lock);
> +
> +	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
> +		if (rq->gem_context == hung_ctx)
> +			i915_request_skip(rq, -EIO);
> +
> +	list_for_each_entry(rq, &timeline->requests, link)
> +		i915_request_skip(rq, -EIO);
> +
> +	spin_unlock(&timeline->lock);
> +	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +}
> +
> +static void client_mark_guilty(struct drm_i915_file_private *file_priv,
> +			       const struct i915_gem_context *ctx)
> +{
> +	unsigned int score;
> +	unsigned long prev_hang;
> +
> +	if (i915_gem_context_is_banned(ctx))
> +		score = I915_CLIENT_SCORE_CONTEXT_BAN;
> +	else
> +		score = 0;
> +
> +	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
> +	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
> +		score += I915_CLIENT_SCORE_HANG_FAST;
> +
> +	if (score) {
> +		atomic_add(score, &file_priv->ban_score);
> +
> +		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
> +				 ctx->name, score,
> +				 atomic_read(&file_priv->ban_score));
> +	}
> +}
> +
> +static void context_mark_guilty(struct i915_gem_context *ctx)
> +{
> +	unsigned int score;
> +	bool banned, bannable;
> +
> +	atomic_inc(&ctx->guilty_count);
> +
> +	bannable = i915_gem_context_is_bannable(ctx);
> +	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> +	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +
> +	/* Cool contexts don't accumulate client ban score */
> +	if (!bannable)
> +		return;
> +
> +	if (banned) {
> +		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> +				 ctx->name, atomic_read(&ctx->guilty_count),
> +				 score);
> +		i915_gem_context_set_banned(ctx);
> +	}
> +
> +	if (!IS_ERR_OR_NULL(ctx->file_priv))
> +		client_mark_guilty(ctx->file_priv, ctx);
> +}
> +
> +static void context_mark_innocent(struct i915_gem_context *ctx)
> +{
> +	atomic_inc(&ctx->active_count);
> +}
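
A stand-alone model of the banning arithmetic above may help when reading the
two helpers; the SCORE_* values below are illustrative assumptions, not the
constants the driver actually uses:

/* Toy model of context_mark_guilty()/client_mark_guilty(); compiles on its
 * own, values are made up for illustration. */
#include <stdbool.h>

#define SCORE_GUILTY		10	/* added per hang the context caused */
#define SCORE_BAN_THRESHOLD	40	/* ban once enough guilt accumulates */

static bool mark_guilty(unsigned int *ban_score, bool bannable)
{
	*ban_score += SCORE_GUILTY;	/* accumulated for every hang */

	/* only bannable ("non-cool") contexts can actually be banned */
	return bannable && *ban_score >= SCORE_BAN_THRESHOLD;
}
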
> +
> +static void gen3_stop_engine(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	const u32 base = engine->mmio_base;
> +
> +	if (intel_engine_stop_cs(engine))
> +		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
> +
> +	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> +	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> +
> +	I915_WRITE_FW(RING_HEAD(base), 0);
> +	I915_WRITE_FW(RING_TAIL(base), 0);
> +	POSTING_READ_FW(RING_TAIL(base));
> +
> +	/* The ring must be empty before it is disabled */
> +	I915_WRITE_FW(RING_CTL(base), 0);
> +
> +	/* Check acts as a post */
> +	if (I915_READ_FW(RING_HEAD(base)) != 0)
> +		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> +				 engine->name);
> +}
> +
> +static void i915_stop_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	if (INTEL_GEN(i915) < 3)
> +		return;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, id)
> +		gen3_stop_engine(engine);
> +}
> +
> +static bool i915_in_reset(struct pci_dev *pdev)
> +{
> +	u8 gdrst;
> +
> +	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> +	return gdrst & GRDOM_RESET_STATUS;
> +}
> +
> +static int i915_do_reset(struct drm_i915_private *i915,
> +			 unsigned int engine_mask,
> +			 unsigned int retry)
> +{
> +	struct pci_dev *pdev = i915->drm.pdev;
> +	int err;
> +
> +	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
> +	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> +	usleep_range(50, 200);
> +	err = wait_for(i915_in_reset(pdev), 500);
> +
> +	/* Clear the reset request. */
> +	pci_write_config_byte(pdev, I915_GDRST, 0);
> +	usleep_range(50, 200);
> +	if (!err)
> +		err = wait_for(!i915_in_reset(pdev), 500);
> +
> +	return err;
> +}
> +
> +static bool g4x_reset_complete(struct pci_dev *pdev)
> +{
> +	u8 gdrst;
> +
> +	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> +	return (gdrst & GRDOM_RESET_ENABLE) == 0;
> +}
> +
> +static int g33_do_reset(struct drm_i915_private *i915,
> +			unsigned int engine_mask,
> +			unsigned int retry)
> +{
> +	struct pci_dev *pdev = i915->drm.pdev;
> +
> +	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> +	return wait_for(g4x_reset_complete(pdev), 500);
> +}
> +
> +static int g4x_do_reset(struct drm_i915_private *dev_priv,
> +			unsigned int engine_mask,
> +			unsigned int retry)
> +{
> +	struct pci_dev *pdev = dev_priv->drm.pdev;
> +	int ret;
> +
> +	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> +	I915_WRITE(VDECCLK_GATE_D,
> +		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ(VDECCLK_GATE_D);
> +
> +	pci_write_config_byte(pdev, I915_GDRST,
> +			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> +	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> +		goto out;
> +	}
> +
> +	pci_write_config_byte(pdev, I915_GDRST,
> +			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> +	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> +		goto out;
> +	}
> +
> +out:
> +	pci_write_config_byte(pdev, I915_GDRST, 0);
> +
> +	I915_WRITE(VDECCLK_GATE_D,
> +		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ(VDECCLK_GATE_D);
> +
> +	return ret;
> +}
> +
> +static int ironlake_do_reset(struct drm_i915_private *dev_priv,
> +			     unsigned int engine_mask,
> +			     unsigned int retry)
> +{
> +	int ret;
> +
> +	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> +	ret = intel_wait_for_register(dev_priv,
> +				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> +				      500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> +		goto out;
> +	}
> +
> +	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> +	ret = intel_wait_for_register(dev_priv,
> +				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> +				      500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> +		goto out;
> +	}
> +
> +out:
> +	I915_WRITE(ILK_GDSR, 0);
> +	POSTING_READ(ILK_GDSR);
> +	return ret;
> +}
> +
> +/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
> +static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
> +				u32 hw_domain_mask)
> +{
> +	int err;
> +
> +	/*
> +	 * GEN6_GDRST is not in the gt power well, no need to check
> +	 * for fifo space for the write or forcewake the chip for
> +	 * the read
> +	 */
> +	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);
> +
> +	/* Wait for the device to ack the reset requests */
> +	err = __intel_wait_for_register_fw(dev_priv,
> +					   GEN6_GDRST, hw_domain_mask, 0,
> +					   500, 0,
> +					   NULL);
> +	if (err)
> +		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
> +				 hw_domain_mask);
> +
> +	return err;
> +}
> +
> +static int gen6_reset_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask,
> +			      unsigned int retry)
> +{
> +	struct intel_engine_cs *engine;
> +	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> +		[RCS] = GEN6_GRDOM_RENDER,
> +		[BCS] = GEN6_GRDOM_BLT,
> +		[VCS] = GEN6_GRDOM_MEDIA,
> +		[VCS2] = GEN8_GRDOM_MEDIA2,
> +		[VECS] = GEN6_GRDOM_VECS,
> +	};
> +	u32 hw_mask;
> +
> +	if (engine_mask == ALL_ENGINES) {
> +		hw_mask = GEN6_GRDOM_FULL;
> +	} else {
> +		unsigned int tmp;
> +
> +		hw_mask = 0;
> +		for_each_engine_masked(engine, i915, engine_mask, tmp)
> +			hw_mask |= hw_engine_mask[engine->id];
> +	}
> +
> +	return gen6_hw_domain_reset(i915, hw_mask);
> +}
> +
> +static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
> +			  struct intel_engine_cs *engine)
> +{
> +	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
> +	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
> +	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
> +	i915_reg_t sfc_usage;
> +	u32 sfc_usage_bit;
> +	u32 sfc_reset_bit;
> +
> +	switch (engine->class) {
> +	case VIDEO_DECODE_CLASS:
> +		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
> +			return 0;
> +
> +		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
> +		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
> +
> +		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
> +		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
> +
> +		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
> +		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
> +		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
> +		break;
> +
> +	case VIDEO_ENHANCEMENT_CLASS:
> +		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
> +		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
> +
> +		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
> +		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
> +
> +		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
> +		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
> +		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
> +		break;
> +
> +	default:
> +		return 0;
> +	}
> +
> +	/*
> +	 * Tell the engine that a software reset is going to happen. The engine
> +	 * will then try to force lock the SFC (if currently locked, it will
> +	 * remain so until we tell the engine it is safe to unlock; if currently
> +	 * unlocked, it will ignore this and all new lock requests). If SFC
> +	 * ends up being locked to the engine we want to reset, we have to reset
> +	 * it as well (we will unlock it once the reset sequence is completed).
> +	 */
> +	I915_WRITE_FW(sfc_forced_lock,
> +		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);
> +
> +	if (__intel_wait_for_register_fw(dev_priv,
> +					 sfc_forced_lock_ack,
> +					 sfc_forced_lock_ack_bit,
> +					 sfc_forced_lock_ack_bit,
> +					 1000, 0, NULL)) {
> +		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> +		return 0;
> +	}
> +
> +	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
> +		return sfc_reset_bit;
> +
> +	return 0;
> +}
> +
> +static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
> +			     struct intel_engine_cs *engine)
> +{
> +	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
> +	i915_reg_t sfc_forced_lock;
> +	u32 sfc_forced_lock_bit;
> +
> +	switch (engine->class) {
> +	case VIDEO_DECODE_CLASS:
> +		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
> +			return;
> +
> +		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
> +		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
> +		break;
> +
> +	case VIDEO_ENHANCEMENT_CLASS:
> +		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
> +		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
> +		break;
> +
> +	default:
> +		return;
> +	}
> +
> +	I915_WRITE_FW(sfc_forced_lock,
> +		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
> +}
> +
> +static int gen11_reset_engines(struct drm_i915_private *i915,
> +			       unsigned int engine_mask,
> +			       unsigned int retry)
> +{
> +	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> +		[RCS] = GEN11_GRDOM_RENDER,
> +		[BCS] = GEN11_GRDOM_BLT,
> +		[VCS] = GEN11_GRDOM_MEDIA,
> +		[VCS2] = GEN11_GRDOM_MEDIA2,
> +		[VCS3] = GEN11_GRDOM_MEDIA3,
> +		[VCS4] = GEN11_GRDOM_MEDIA4,
> +		[VECS] = GEN11_GRDOM_VECS,
> +		[VECS2] = GEN11_GRDOM_VECS2,
> +	};
> +	struct intel_engine_cs *engine;
> +	unsigned int tmp;
> +	u32 hw_mask;
> +	int ret;
> +
> +	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
> +
> +	if (engine_mask == ALL_ENGINES) {
> +		hw_mask = GEN11_GRDOM_FULL;
> +	} else {
> +		hw_mask = 0;
> +		for_each_engine_masked(engine, i915, engine_mask, tmp) {
> +			hw_mask |= hw_engine_mask[engine->id];
> +			hw_mask |= gen11_lock_sfc(i915, engine);
> +		}
> +	}
> +
> +	ret = gen6_hw_domain_reset(i915, hw_mask);
> +
> +	if (engine_mask != ALL_ENGINES)
> +		for_each_engine_masked(engine, i915, engine_mask, tmp)
> +			gen11_unlock_sfc(i915, engine);
> +
> +	return ret;
> +}
> +
> +static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	int ret;
> +
> +	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> +		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
> +
> +	ret = __intel_wait_for_register_fw(dev_priv,
> +					   RING_RESET_CTL(engine->mmio_base),
> +					   RESET_CTL_READY_TO_RESET,
> +					   RESET_CTL_READY_TO_RESET,
> +					   700, 0,
> +					   NULL);
> +	if (ret)
> +		DRM_ERROR("%s: reset request timeout\n", engine->name);
> +
> +	return ret;
> +}
> +
> +static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> +		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> +}
> +
> +static int gen8_reset_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask,
> +			      unsigned int retry)
> +{
> +	struct intel_engine_cs *engine;
> +	const bool reset_non_ready = retry >= 1;
> +	unsigned int tmp;
> +	int ret;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, tmp) {
> +		ret = gen8_engine_reset_prepare(engine);
> +		if (ret && !reset_non_ready)
> +			goto skip_reset;
> +
> +		/*
> +		 * If this is not the first failed attempt to prepare,
> +		 * we decide to proceed anyway.
> +		 *
> +		 * By doing so we risk context corruption and with
> +		 * some gens (kbl), possible system hang if reset
> +		 * happens during active bb execution.
> +		 *
> +		 * We would rather take context corruption than a
> +		 * failed reset with a wedged driver/gpu. And the
> +		 * active bb execution case should be covered by the
> +		 * i915_stop_engines() call we make before the reset.
> +		 */
> +	}
> +
> +	if (INTEL_GEN(i915) >= 11)
> +		ret = gen11_reset_engines(i915, engine_mask, retry);
> +	else
> +		ret = gen6_reset_engines(i915, engine_mask, retry);
> +
> +skip_reset:
> +	for_each_engine_masked(engine, i915, engine_mask, tmp)
> +		gen8_engine_reset_cancel(engine);
> +
> +	return ret;
> +}
> +
> +typedef int (*reset_func)(struct drm_i915_private *,
> +			  unsigned int engine_mask,
> +			  unsigned int retry);
> +
> +static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
> +{
> +	if (!i915_modparams.reset)
> +		return NULL;
> +
> +	if (INTEL_GEN(i915) >= 8)
> +		return gen8_reset_engines;
> +	else if (INTEL_GEN(i915) >= 6)
> +		return gen6_reset_engines;
> +	else if (INTEL_GEN(i915) >= 5)
> +		return ironlake_do_reset;
> +	else if (IS_G4X(i915))
> +		return g4x_do_reset;
> +	else if (IS_G33(i915) || IS_PINEVIEW(i915))
> +		return g33_do_reset;
> +	else if (INTEL_GEN(i915) >= 3)
> +		return i915_do_reset;
> +	else
> +		return NULL;
> +}
> +
> +int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
> +{
> +	reset_func reset = intel_get_gpu_reset(i915);
> +	int retry;
> +	int ret;
> +
> +	/*
> +	 * We want to perform per-engine reset from atomic context (e.g.
> +	 * softirq), which imposes the constraint that we cannot sleep.
> +	 * However, experience suggests that spending a bit of time waiting
> +	 * for a reset helps in various cases, so for a full-device reset
> +	 * we apply the opposite rule and wait if we want to. As we should
> +	 * always follow up a failed per-engine reset with a full device reset,
> +	 * being a little faster, stricter and more error prone for the
> +	 * atomic case seems an acceptable compromise.
> +	 *
> +	 * Unfortunately this leads to a bimodal routine, when the goal was
> +	 * to have a single reset function that worked for resetting any
> +	 * number of engines simultaneously.
> +	 */
> +	might_sleep_if(engine_mask == ALL_ENGINES);
> +
> +	/*
> +	 * If the power well sleeps during the reset, the reset
> +	 * request may be dropped and never completes (causing -EIO).
> +	 */
> +	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
> +	for (retry = 0; retry < 3; retry++) {
> +		/*
> +		 * We stop engines, otherwise we might get failed reset and a
> +		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
> +		 * from system hang if batchbuffer is progressing when
> +		 * the reset is issued, regardless of READY_TO_RESET ack.
> +		 * Thus assume it is best to stop engines on all gens
> +		 * where we have a gpu reset.
> +		 *
> +		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> +		 *
> +		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
> +		 *
> +		 * FIXME: Wa for more modern gens needs to be validated
> +		 */
> +		i915_stop_engines(i915, engine_mask);
> +
> +		ret = -ENODEV;
> +		if (reset) {
> +			GEM_TRACE("engine_mask=%x\n", engine_mask);
> +			ret = reset(i915, engine_mask, retry);
> +		}
> +		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
> +			break;
> +
> +		cond_resched();
> +	}
> +	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
> +
> +	return ret;
> +}
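
For readers following the bimodal comment above, the two intended call
patterns look roughly like this (the example_* wrappers are made up for
illustration only; intel_engine_flag() and ALL_ENGINES are as used elsewhere
in this patch):

/* Illustration only, not a hunk from this series. */

/* a) single-engine reset: intended to be safe from atomic context
 *    (e.g. hangcheck), so it must not sleep and does not retry. */
static int example_reset_engine_only(struct intel_engine_cs *engine)
{
	return intel_gpu_reset(engine->i915, intel_engine_flag(engine));
}

/* b) full-device reset: process context only, may msleep()/cond_resched()
 *    while retrying up to three times. */
static int example_reset_full(struct drm_i915_private *i915)
{
	might_sleep();
	return intel_gpu_reset(i915, ALL_ENGINES);
}
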
> +
> +bool intel_has_gpu_reset(struct drm_i915_private *i915)
> +{
> +	return intel_get_gpu_reset(i915);
> +}
> +
> +bool intel_has_reset_engine(struct drm_i915_private *i915)
> +{
> +	return INTEL_INFO(i915)->has_reset_engine && i915_modparams.reset >= 2;
> +}
> +
> +int intel_reset_guc(struct drm_i915_private *i915)
> +{
> +	u32 guc_domain =
> +		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
> +	int ret;
> +
> +	GEM_BUG_ON(!HAS_GUC(i915));
> +
> +	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
> +	ret = gen6_hw_domain_reset(i915, guc_domain);
> +	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
> +
> +	return ret;
> +}
> +
> +/*
> + * Ensure the irq handler finishes, and does not run again.
> + * Also return the active request so that we only search for it once.
> + */
> +static struct i915_request *
> +reset_prepare_engine(struct intel_engine_cs *engine)
> +{
> +	struct i915_request *rq;
> +
> +	/*
> +	 * During the reset sequence, we must prevent the engine from
> +	 * entering RC6. As the context state is undefined until we restart
> +	 * the engine, if it does enter RC6 during the reset, the state
> +	 * written to the powercontext is undefined and so we may lose
> +	 * GPU state upon resume, i.e. fail to restart after a reset.
> +	 */
> +	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
> +
> +	rq = engine->reset.prepare(engine);
> +	if (rq && rq->fence.error == -EIO)
> +		rq = ERR_PTR(-EIO); /* Previous reset failed! */
> +
> +	return rq;
> +}
> +
> +static int reset_prepare(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	struct i915_request *rq;
> +	enum intel_engine_id id;
> +	int err = 0;
> +
> +	for_each_engine(engine, i915, id) {
> +		rq = reset_prepare_engine(engine);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			continue;
> +		}
> +
> +		engine->hangcheck.active_request = rq;
> +	}
> +
> +	i915_gem_revoke_fences(i915);
> +	intel_uc_sanitize(i915);
> +
> +	return err;
> +}
> +
> +/* Returns the request if it was guilty of the hang */
> +static struct i915_request *
> +reset_request(struct intel_engine_cs *engine,
> +	      struct i915_request *rq,
> +	      bool stalled)
> +{
> +	/*
> +	 * The guilty request will get skipped on a hung engine.
> +	 *
> +	 * Users of client default contexts do not rely on logical
> +	 * state preserved between batches so it is safe to execute
> +	 * queued requests following the hang. Non default contexts
> +	 * rely on preserved state, so skipping a batch loses the
> +	 * evolution of the state and it needs to be considered corrupted.
> +	 * Executing more queued batches on top of corrupted state is
> +	 * risky. But we take the risk by trying to advance through
> +	 * the queued requests in order to make the client behaviour
> +	 * more predictable around resets, by not throwing away a random
> +	 * number of batches it has prepared for execution. Sophisticated
> +	 * clients can use gem_reset_stats_ioctl and dma fence status
> +	 * (exported via sync_file info ioctl on explicit fences) to observe
> +	 * when they lose the context state and should rebuild accordingly.
> +	 *
> +	 * The context ban, and ultimately the client ban, mechanism are safety
> +	 * valves if client submission ends up resulting in nothing more than
> +	 * subsequent hangs.
> +	 */
> +
> +	if (i915_request_completed(rq)) {
> +		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
> +			  engine->name, rq->global_seqno,
> +			  rq->fence.context, rq->fence.seqno,
> +			  intel_engine_get_seqno(engine));
> +		stalled = false;
> +	}
> +
> +	if (stalled) {
> +		context_mark_guilty(rq->gem_context);
> +		i915_request_skip(rq, -EIO);
> +
> +		/* If this context is now banned, skip all pending requests. */
> +		if (i915_gem_context_is_banned(rq->gem_context))
> +			engine_skip_context(rq);
> +	} else {
> +		/*
> +		 * Since this is not the hung engine, it may have advanced
> +		 * since the hang declaration. Double check by refinding
> +		 * the active request at the time of the reset.
> +		 */
> +		rq = i915_gem_find_active_request(engine);
> +		if (rq) {
> +			unsigned long flags;
> +
> +			context_mark_innocent(rq->gem_context);
> +			dma_fence_set_error(&rq->fence, -EAGAIN);
> +
> +			/* Rewind the engine to replay the incomplete rq */
> +			spin_lock_irqsave(&engine->timeline.lock, flags);
> +			rq = list_prev_entry(rq, link);
> +			if (&rq->link == &engine->timeline.requests)
> +				rq = NULL;
> +			spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +		}
> +	}
> +
> +	return rq;
> +}
> +
> +static void reset_engine(struct intel_engine_cs *engine,
> +			 struct i915_request *rq,
> +			 bool stalled)
> +{
> +	if (rq)
> +		rq = reset_request(engine, rq, stalled);
> +
> +	/* Setup the CS to resume from the breadcrumb of the hung request */
> +	engine->reset.reset(engine, rq);
> +}
> +
> +static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	i915_retire_requests(i915);
> +
> +	for_each_engine(engine, i915, id) {
> +		struct intel_context *ce;
> +
> +		reset_engine(engine,
> +			     engine->hangcheck.active_request,
> +			     stalled_mask & ENGINE_MASK(id));
> +		ce = fetch_and_zero(&engine->last_retired_context);
> +		if (ce)
> +			intel_context_unpin(ce);
> +
> +		/*
> +		 * Ostensibly, we always want a context loaded for powersaving,
> +		 * so if the engine is idle after the reset, send a request
> +		 * to load our scratch kernel_context.
> +		 *
> +		 * More mysteriously, if we leave the engine idle after a reset,
> +		 * the next userspace batch may hang, with what appears to be
> +		 * an incoherent read by the CS (presumably stale TLB). An
> +		 * empty request appears sufficient to paper over the glitch.
> +		 */
> +		if (intel_engine_is_idle(engine)) {
> +			struct i915_request *rq;
> +
> +			rq = i915_request_alloc(engine, i915->kernel_context);
> +			if (!IS_ERR(rq))
> +				i915_request_add(rq);
> +		}
> +	}
> +
> +	i915_gem_restore_fences(i915);
> +}
> +
> +static void reset_finish_engine(struct intel_engine_cs *engine)
> +{
> +	engine->reset.finish(engine);
> +
> +	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +}
> +
> +static void reset_finish(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	for_each_engine(engine, i915, id) {
> +		engine->hangcheck.active_request = NULL;
> +		reset_finish_engine(engine);
> +	}
> +}
> +
> +static void nop_submit_request(struct i915_request *request)
> +{
> +	unsigned long flags;
> +
> +	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
> +		  request->engine->name,
> +		  request->fence.context, request->fence.seqno);
> +	dma_fence_set_error(&request->fence, -EIO);
> +
> +	spin_lock_irqsave(&request->engine->timeline.lock, flags);
> +	__i915_request_submit(request);
> +	intel_engine_write_global_seqno(request->engine, request->global_seqno);
> +	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
> +}
> +
> +void i915_gem_set_wedged(struct drm_i915_private *i915)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	mutex_lock(&error->wedge_mutex);
> +	if (test_bit(I915_WEDGED, &error->flags)) {
> +		mutex_unlock(&error->wedge_mutex);
> +		return;
> +	}
> +
> +	if (GEM_SHOW_DEBUG()) {
> +		struct drm_printer p = drm_debug_printer(__func__);
> +
> +		for_each_engine(engine, i915, id)
> +			intel_engine_dump(engine, &p, "%s\n", engine->name);
> +	}
> +
> +	GEM_TRACE("start\n");
> +
> +	/*
> +	 * First, stop submission to hw, but do not yet complete requests by
> +	 * rolling the global seqno forward (since this would complete requests
> +	 * for which we haven't set the fence error to EIO yet).
> +	 */
> +	for_each_engine(engine, i915, id)
> +		reset_prepare_engine(engine);
> +
> +	/* Even if the GPU reset fails, it should still stop the engines */
> +	if (INTEL_GEN(i915) >= 5)
> +		intel_gpu_reset(i915, ALL_ENGINES);
> +
> +	for_each_engine(engine, i915, id) {
> +		engine->submit_request = nop_submit_request;
> +		engine->schedule = NULL;
> +	}
> +	i915->caps.scheduler = 0;
> +
> +	/*
> +	 * Make sure no request can slip through without getting completed by
> +	 * either this call here to intel_engine_write_global_seqno, or the one
> +	 * in nop_submit_request.
> +	 */
> +	synchronize_rcu();
> +
> +	/* Mark all executing requests as skipped */
> +	for_each_engine(engine, i915, id)
> +		engine->cancel_requests(engine);
> +
> +	for_each_engine(engine, i915, id) {
> +		reset_finish_engine(engine);
> +		intel_engine_wakeup(engine);
> +	}
> +
> +	smp_mb__before_atomic();
> +	set_bit(I915_WEDGED, &error->flags);
> +
> +	GEM_TRACE("end\n");
> +	mutex_unlock(&error->wedge_mutex);
> +
> +	wake_up_all(&error->reset_queue);
> +}
> +
> +bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	struct i915_timeline *tl;
> +	bool ret = false;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	if (!test_bit(I915_WEDGED, &error->flags))
> +		return true;
> +
> +	if (!i915->gt.scratch) /* Never fully initialised, recovery impossible */
> +		return false;
> +
> +	mutex_lock(&error->wedge_mutex);
> +
> +	GEM_TRACE("start\n");
> +
> +	/*
> +	 * Before unwedging, make sure that all pending operations
> +	 * are flushed and errored out - we may have requests waiting upon
> +	 * third party fences. We marked all inflight requests as EIO, and
> +	 * every execbuf since returned EIO, for consistency we want all
> +	 * the currently pending requests to also be marked as EIO, which
> +	 * is done inside our nop_submit_request - and so we must wait.
> +	 *
> +	 * No more can be submitted until we reset the wedged bit.
> +	 */
> +	list_for_each_entry(tl, &i915->gt.timelines, link) {
> +		struct i915_request *rq;
> +
> +		rq = i915_gem_active_peek(&tl->last_request,
> +					  &i915->drm.struct_mutex);
> +		if (!rq)
> +			continue;
> +
> +		/*
> +		 * We can't use our normal waiter as we want to
> +		 * avoid recursively trying to handle the current
> +		 * reset. The basic dma_fence_default_wait() installs
> +		 * a callback for dma_fence_signal(), which is
> +		 * triggered by our nop handler (indirectly: the
> +		 * callback enables the signaler thread, which is
> +		 * woken by nop_submit_request() advancing the seqno;
> +		 * when the seqno passes the fence, the signaler
> +		 * signals the fence, waking us up).
> +		 */
> +		if (dma_fence_default_wait(&rq->fence, true,
> +					   MAX_SCHEDULE_TIMEOUT) < 0)
> +			goto unlock;
> +	}
> +	i915_retire_requests(i915);
> +	GEM_BUG_ON(i915->gt.active_requests);
> +
> +	intel_engines_sanitize(i915, false);
> +
> +	/*
> +	 * Undo nop_submit_request. We prevent all new i915 requests from
> +	 * being queued (by disallowing execbuf whilst wedged) so having
> +	 * waited for all active requests above, we know the system is idle
> +	 * and do not have to worry about a thread being inside
> +	 * engine->submit_request() as we swap over. So unlike installing
> +	 * the nop_submit_request on reset, we can do this from normal
> +	 * context and do not require stop_machine().
> +	 */
> +	intel_engines_reset_default_submission(i915);
> +	i915_gem_contexts_lost(i915);
> +
> +	GEM_TRACE("end\n");
> +
> +	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> +	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> +	ret = true;
> +unlock:
> +	mutex_unlock(&i915->gpu_error.wedge_mutex);
> +
> +	return ret;
> +}
> +
> +/**
> + * i915_reset - reset chip after a hang
> + * @i915: #drm_i915_private to reset
> + * @stalled_mask: mask of the stalled engines with the guilty requests
> + * @reason: user error message for why we are resetting
> + *
> + * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
> + * on failure.
> + *
> + * Caller must hold the struct_mutex.
> + *
> + * Procedure is fairly simple:
> + *   - reset the chip using the reset reg
> + *   - re-init context state
> + *   - re-init hardware status page
> + *   - re-init ring buffer
> + *   - re-init interrupt state
> + *   - re-init display
> + */
> +void i915_reset(struct drm_i915_private *i915,
> +		unsigned int stalled_mask,
> +		const char *reason)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	int ret;
> +	int i;
> +
> +	GEM_TRACE("flags=%lx\n", error->flags);
> +
> +	might_sleep();
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +	assert_rpm_wakelock_held(i915);
> +	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> +
> +	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
> +		return;
> +
> +	/* Clear any previous failed attempts at recovery. Time to try again. */
> +	if (!i915_gem_unset_wedged(i915))
> +		goto wakeup;
> +
> +	if (reason)
> +		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> +	error->reset_count++;
> +
> +	ret = reset_prepare(i915);
> +	if (ret) {
> +		dev_err(i915->drm.dev, "GPU recovery failed\n");
> +		goto taint;
> +	}
> +
> +	if (!intel_has_gpu_reset(i915)) {
> +		if (i915_modparams.reset)
> +			dev_err(i915->drm.dev, "GPU reset not supported\n");
> +		else
> +			DRM_DEBUG_DRIVER("GPU reset disabled\n");
> +		goto error;
> +	}
> +
> +	for (i = 0; i < 3; i++) {
> +		ret = intel_gpu_reset(i915, ALL_ENGINES);
> +		if (ret == 0)
> +			break;
> +
> +		msleep(100);
> +	}
> +	if (ret) {
> +		dev_err(i915->drm.dev, "Failed to reset chip\n");
> +		goto taint;
> +	}
> +
> +	/* Ok, now get things going again... */
> +
> +	/*
> +	 * Everything depends on having the GTT running, so we need to start
> +	 * there.
> +	 */
> +	ret = i915_ggtt_enable_hw(i915);
> +	if (ret) {
> +		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
> +			  ret);
> +		goto error;
> +	}
> +
> +	gt_reset(i915, stalled_mask);
> +	intel_overlay_reset(i915);
> +
> +	/*
> +	 * Next we need to restore the context, but we don't use those
> +	 * yet either...
> +	 *
> +	 * Ring buffer needs to be re-initialized in the KMS case, or if X
> +	 * was running at the time of the reset (i.e. we weren't VT
> +	 * switched away).
> +	 */
> +	ret = i915_gem_init_hw(i915);
> +	if (ret) {
> +		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
> +			  ret);
> +		goto error;
> +	}
> +
> +	i915_queue_hangcheck(i915);
> +
> +finish:
> +	reset_finish(i915);
> +wakeup:
> +	clear_bit(I915_RESET_HANDOFF, &error->flags);
> +	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
> +	return;
> +
> +taint:
> +	/*
> +	 * History tells us that if we cannot reset the GPU now, we
> +	 * never will. This then impacts everything that is run
> +	 * subsequently. On failing the reset, we mark the driver
> +	 * as wedged, preventing further execution on the GPU.
> +	 * We also want to go one step further and add a taint to the
> +	 * kernel so that any subsequent faults can be traced back to
> +	 * this failure. This is important for CI, where if the
> +	 * GPU/driver fails we would like to reboot and restart testing
> +	 * rather than continue on into oblivion. For everyone else,
> +	 * the system should still plod along, but they have been warned!
> +	 */
> +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> +error:
> +	i915_gem_set_wedged(i915);
> +	i915_retire_requests(i915);
> +	goto finish;
> +}
> +
> +static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
> +					struct intel_engine_cs *engine)
> +{
> +	return intel_gpu_reset(i915, intel_engine_flag(engine));
> +}
> +
> +/**
> + * i915_reset_engine - reset GPU engine to recover from a hang
> + * @engine: engine to reset
> + * @msg: reason for GPU reset; or NULL for no dev_notice()
> + *
> + * Reset a specific GPU engine. Useful if a hang is detected.
> + * Returns zero on successful reset or otherwise an error code.
> + *
> + * Procedure is:
> + *  - identify the request that caused the hang and drop it
> + *  - reset the engine (which will force the engine to idle)
> + *  - re-init/configure the engine
> + */
> +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> +{
> +	struct i915_gpu_error *error = &engine->i915->gpu_error;
> +	struct i915_request *active_request;
> +	int ret;
> +
> +	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
> +	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
> +
> +	active_request = reset_prepare_engine(engine);
> +	if (IS_ERR_OR_NULL(active_request)) {
> +		/* Either the previous reset failed, or we pardon the reset. */
> +		ret = PTR_ERR(active_request);
> +		goto out;
> +	}
> +
> +	if (msg)
> +		dev_notice(engine->i915->drm.dev,
> +			   "Resetting %s for %s\n", engine->name, msg);
> +	error->reset_engine_count[engine->id]++;
> +
> +	if (!engine->i915->guc.execbuf_client)
> +		ret = intel_gt_reset_engine(engine->i915, engine);
> +	else
> +		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
> +	if (ret) {
> +		/* If we fail here, we expect to fallback to a global reset */
> +		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
> +				 engine->i915->guc.execbuf_client ? "GuC " : "",
> +				 engine->name, ret);
> +		goto out;
> +	}
> +
> +	/*
> +	 * The request that caused the hang is stuck on elsp; we know the
> +	 * active request and can drop it, then adjust the head to skip the
> +	 * offending request and resume executing the remaining requests in
> +	 * the queue.
> +	 */
> +	reset_engine(engine, active_request, true);
> +
> +	/*
> +	 * The engine and its registers (and workarounds in case of render)
> +	 * have been reset to their default values. Follow the init_ring
> +	 * process to program RING_MODE, HWSP and re-enable submission.
> +	 */
> +	ret = engine->init_hw(engine);
> +	if (ret)
> +		goto out;
> +
> +out:
> +	intel_engine_cancel_stop_cs(engine);
> +	reset_finish_engine(engine);
> +	return ret;
> +}
> +
> +static void i915_reset_device(struct drm_i915_private *i915,
> +			      u32 engine_mask,
> +			      const char *reason)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
> +	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> +	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> +	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> +	struct i915_wedge_me w;
> +
> +	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> +
> +	DRM_DEBUG_DRIVER("resetting chip\n");
> +	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> +
> +	/* Use a watchdog to ensure that our reset completes */
> +	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
> +		intel_prepare_reset(i915);
> +
> +		error->reason = reason;
> +		error->stalled_mask = engine_mask;
> +
> +		/* Signal that locked waiters should reset the GPU */
> +		smp_mb__before_atomic();
> +		set_bit(I915_RESET_HANDOFF, &error->flags);
> +		wake_up_all(&error->wait_queue);
> +
> +		/*
> +		 * Wait for anyone holding the lock to wakeup, without
> +		 * blocking indefinitely on struct_mutex.
> +		 */
> +		do {
> +			if (mutex_trylock(&i915->drm.struct_mutex)) {
> +				i915_reset(i915, engine_mask, reason);
> +				mutex_unlock(&i915->drm.struct_mutex);
> +			}
> +		} while (wait_on_bit_timeout(&error->flags,
> +					     I915_RESET_HANDOFF,
> +					     TASK_UNINTERRUPTIBLE,
> +					     1));
> +
> +		error->stalled_mask = 0;
> +		error->reason = NULL;
> +
> +		intel_finish_reset(i915);
> +	}
> +
> +	if (!test_bit(I915_WEDGED, &error->flags))
> +		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
> +}
> +
> +void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> +{
> +	u32 eir;
> +
> +	if (!IS_GEN(dev_priv, 2))
> +		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
> +
> +	if (INTEL_GEN(dev_priv) < 4)
> +		I915_WRITE(IPEIR, I915_READ(IPEIR));
> +	else
> +		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
> +
> +	I915_WRITE(EIR, I915_READ(EIR));
> +	eir = I915_READ(EIR);
> +	if (eir) {
> +		/*
> +		 * some errors might have become stuck,
> +		 * mask them.
> +		 */
> +		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
> +		I915_WRITE(EMR, I915_READ(EMR) | eir);
> +		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
> +	}
> +
> +	if (INTEL_GEN(dev_priv) >= 8) {
> +		I915_WRITE(GEN8_RING_FAULT_REG,
> +			   I915_READ(GEN8_RING_FAULT_REG) & ~RING_FAULT_VALID);
> +		POSTING_READ(GEN8_RING_FAULT_REG);
> +	} else if (INTEL_GEN(dev_priv) >= 6) {
> +		struct intel_engine_cs *engine;
> +		enum intel_engine_id id;
> +
> +		for_each_engine(engine, dev_priv, id) {
> +			I915_WRITE(RING_FAULT_REG(engine),
> +				   I915_READ(RING_FAULT_REG(engine)) &
> +				   ~RING_FAULT_VALID);
> +		}
> +		POSTING_READ(RING_FAULT_REG(dev_priv->engine[RCS]));
> +	}
> +}
> +
> +/**
> + * i915_handle_error - handle a gpu error
> + * @i915: i915 device private
> + * @engine_mask: mask representing engines that are hung
> + * @flags: control flags
> + * @fmt: Error message format string
> + *
> + * Do some basic checking of register state at error time and
> + * dump it to the syslog.  Also call i915_capture_error_state() to make
> + * sure we get a record and make it available in debugfs.  Fire a uevent
> + * so userspace knows something bad happened (should trigger collection
> + * of a ring dump etc.).
> + */
> +void i915_handle_error(struct drm_i915_private *i915,
> +		       u32 engine_mask,
> +		       unsigned long flags,
> +		       const char *fmt, ...)
> +{
> +	struct intel_engine_cs *engine;
> +	intel_wakeref_t wakeref;
> +	unsigned int tmp;
> +	char error_msg[80];
> +	char *msg = NULL;
> +
> +	if (fmt) {
> +		va_list args;
> +
> +		va_start(args, fmt);
> +		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> +		va_end(args);
> +
> +		msg = error_msg;
> +	}
> +
> +	/*
> +	 * In most cases it's guaranteed that we get here with an RPM
> +	 * reference held, for example because there is a pending GPU
> +	 * request that won't finish until the reset is done. This
> +	 * isn't the case at least when we get here by doing a
> +	 * simulated reset via debugfs, so get an RPM reference.
> +	 */
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	engine_mask &= INTEL_INFO(i915)->ring_mask;
> +
> +	if (flags & I915_ERROR_CAPTURE) {
> +		i915_capture_error_state(i915, engine_mask, msg);
> +		i915_clear_error_registers(i915);
> +	}
> +
> +	/*
> +	 * Try engine reset when available. We fall back to full reset if
> +	 * single reset fails.
> +	 */
> +	if (intel_has_reset_engine(i915) &&
> +	    !i915_terminally_wedged(&i915->gpu_error)) {
> +		for_each_engine_masked(engine, i915, engine_mask, tmp) {
> +			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
> +			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> +					     &i915->gpu_error.flags))
> +				continue;
> +
> +			if (i915_reset_engine(engine, msg) == 0)
> +				engine_mask &= ~intel_engine_flag(engine);
> +
> +			clear_bit(I915_RESET_ENGINE + engine->id,
> +				  &i915->gpu_error.flags);
> +			wake_up_bit(&i915->gpu_error.flags,
> +				    I915_RESET_ENGINE + engine->id);
> +		}
> +	}
> +
> +	if (!engine_mask)
> +		goto out;
> +
> +	/* Full reset needs the mutex, stop any other user trying to do so. */
> +	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
> +		wait_event(i915->gpu_error.reset_queue,
> +			   !test_bit(I915_RESET_BACKOFF,
> +				     &i915->gpu_error.flags));
> +		goto out;
> +	}
> +
> +	/* Prevent any other reset-engine attempt. */
> +	for_each_engine(engine, i915, tmp) {
> +		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> +					&i915->gpu_error.flags))
> +			wait_on_bit(&i915->gpu_error.flags,
> +				    I915_RESET_ENGINE + engine->id,
> +				    TASK_UNINTERRUPTIBLE);
> +	}
> +
> +	i915_reset_device(i915, engine_mask, msg);
> +
> +	for_each_engine(engine, i915, tmp) {
> +		clear_bit(I915_RESET_ENGINE + engine->id,
> +			  &i915->gpu_error.flags);
> +	}
> +
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	wake_up_all(&i915->gpu_error.reset_queue);
> +
> +out:
> +	intel_runtime_pm_put(i915, wakeref);
> +}
> +
> +static void i915_wedge_me(struct work_struct *work)
> +{
> +	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	dev_err(w->i915->drm.dev,
> +		"%s timed out, cancelling all in-flight rendering.\n",
> +		w->name);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +void __i915_init_wedge(struct i915_wedge_me *w,
> +		       struct drm_i915_private *i915,
> +		       long timeout,
> +		       const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, i915_wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +void __i915_fini_wedge(struct i915_wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
> new file mode 100644
> index 000000000000..b6a519bde67d
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_reset.h
> @@ -0,0 +1,56 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2008-2018 Intel Corporation
> + */
> +
> +#ifndef I915_RESET_H
> +#define I915_RESET_H
> +
> +#include <linux/compiler.h>
> +#include <linux/types.h>
> +
> +struct drm_i915_private;
> +struct intel_engine_cs;
> +struct intel_guc;
> +
> +__printf(4, 5)
> +void i915_handle_error(struct drm_i915_private *i915,
> +		       u32 engine_mask,
> +		       unsigned long flags,
> +		       const char *fmt, ...);
> +#define I915_ERROR_CAPTURE BIT(0)
> +
> +void i915_clear_error_registers(struct drm_i915_private *i915);
> +
> +void i915_reset(struct drm_i915_private *i915,
> +		unsigned int stalled_mask,
> +		const char *reason);
> +int i915_reset_engine(struct intel_engine_cs *engine,
> +		      const char *reason);
> +
> +bool intel_has_gpu_reset(struct drm_i915_private *i915);
> +bool intel_has_reset_engine(struct drm_i915_private *i915);
> +
> +int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);
> +
> +int intel_reset_guc(struct drm_i915_private *i915);
> +
> +struct i915_wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +void __i915_init_wedge(struct i915_wedge_me *w,
> +		       struct drm_i915_private *i915,
> +		       long timeout,
> +		       const char *name);
> +void __i915_fini_wedge(struct i915_wedge_me *w);
> +
> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
> +	for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__);	\
> +	     (W)->i915;							\
> +	     __i915_fini_wedge((W)))
> +
> +#endif /* I915_RESET_H */
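
The for-loop trick in i915_wedge_on_timeout() reads as a scope guard; a
minimal usage sketch (the example_recover() wrapper is hypothetical, the
pattern mirrors i915_reset_device() above):

/* Hypothetical caller, for illustration only. */
static void example_recover(struct drm_i915_private *i915)
{
	struct i915_wedge_me w;

	/* If the body takes longer than 5s, the delayed work fires
	 * i915_wedge_me() and declares the device wedged. */
	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		/* slow recovery work goes here */
	}
	/* the loop exits after one pass; __i915_fini_wedge() has already
	 * cancelled the watchdog. */
}
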
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 64dbd06f4ffb..fbe3c3a3b675 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -31,13 +31,7 @@
>  #include <linux/slab.h>
>  #include <linux/vgaarb.h>
>  #include <drm/drm_edid.h>
> -#include "intel_drv.h"
> -#include "intel_frontbuffer.h"
>  #include <drm/i915_drm.h>
> -#include "i915_drv.h"
> -#include "i915_gem_clflush.h"
> -#include "intel_dsi.h"
> -#include "i915_trace.h"
>  #include <drm/drm_atomic.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_dp_helper.h>
> @@ -48,6 +42,15 @@
>  #include <linux/intel-iommu.h>
>  #include <linux/reservation.h>
>  
> +#include "intel_drv.h"
> +#include "intel_dsi.h"
> +#include "intel_frontbuffer.h"
> +
> +#include "i915_drv.h"
> +#include "i915_gem_clflush.h"
> +#include "i915_reset.h"
> +#include "i915_trace.h"
> +
>  /* Primary plane formats for gen <= 3 */
>  static const uint32_t i8xx_primary_formats[] = {
>  	DRM_FORMAT_C8,
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index bf4dae2649ab..9c943bb95cb9 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -25,6 +25,7 @@
>  #include <drm/drm_print.h>
>  
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  #include "intel_ringbuffer.h"
>  #include "intel_lrc.h"
>  
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 0f1c4f9ebfd8..744220296653 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -192,4 +192,7 @@ static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask)
>  	spin_unlock_irq(&guc->irq_lock);
>  }
>  
> +int intel_guc_reset_engine(struct intel_guc *guc,
> +			   struct intel_engine_cs *engine);
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 51e9efec5116..7dc11fcb13de 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
>  {
> diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
> index 731b82afe636..e711eb3268bc 100644
> --- a/drivers/gpu/drm/i915/intel_uc.c
> +++ b/drivers/gpu/drm/i915/intel_uc.c
> @@ -26,6 +26,7 @@
>  #include "intel_guc_submission.h"
>  #include "intel_guc.h"
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static void guc_free_load_err_log(struct intel_guc *guc);
>  
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index 681ea532585e..e88f0252d77e 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1715,372 +1715,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	const u32 base = engine->mmio_base;
> -
> -	if (intel_engine_stop_cs(engine))
> -		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
> -
> -	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> -	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> -
> -	I915_WRITE_FW(RING_HEAD(base), 0);
> -	I915_WRITE_FW(RING_TAIL(base), 0);
> -	POSTING_READ_FW(RING_TAIL(base));
> -
> -	/* The ring must be empty before it is disabled */
> -	I915_WRITE_FW(RING_CTL(base), 0);
> -
> -	/* Check acts as a post */
> -	if (I915_READ_FW(RING_HEAD(base)) != 0)
> -		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> -				 engine->name);
> -}
> -
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned int engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> -}
> -
> -static bool i915_in_reset(struct pci_dev *pdev)
> -{
> -	u8 gdrst;
> -
> -	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> -	return gdrst & GRDOM_RESET_STATUS;
> -}
> -
> -static int i915_do_reset(struct drm_i915_private *dev_priv,
> -			 unsigned int engine_mask,
> -			 unsigned int retry)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -	int err;
> -
> -	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
> -	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	usleep_range(50, 200);
> -	err = wait_for(i915_in_reset(pdev), 500);
> -
> -	/* Clear the reset request. */
> -	pci_write_config_byte(pdev, I915_GDRST, 0);
> -	usleep_range(50, 200);
> -	if (!err)
> -		err = wait_for(!i915_in_reset(pdev), 500);
> -
> -	return err;
> -}
> -
> -static bool g4x_reset_complete(struct pci_dev *pdev)
> -{
> -	u8 gdrst;
> -
> -	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> -	return (gdrst & GRDOM_RESET_ENABLE) == 0;
> -}
> -
> -static int g33_do_reset(struct drm_i915_private *dev_priv,
> -			unsigned int engine_mask,
> -			unsigned int retry)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -
> -	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	return wait_for(g4x_reset_complete(pdev), 500);
> -}
> -
> -static int g4x_do_reset(struct drm_i915_private *dev_priv,
> -			unsigned int engine_mask,
> -			unsigned int retry)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -	int ret;
> -
> -	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> -
> -	pci_write_config_byte(pdev, I915_GDRST,
> -			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> -		goto out;
> -	}
> -
> -	pci_write_config_byte(pdev, I915_GDRST,
> -			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> -		goto out;
> -	}
> -
> -out:
> -	pci_write_config_byte(pdev, I915_GDRST, 0);
> -
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> -
> -	return ret;
> -}
> -
> -static int ironlake_do_reset(struct drm_i915_private *dev_priv,
> -			     unsigned int engine_mask,
> -			     unsigned int retry)
> -{
> -	int ret;
> -
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> -		goto out;
> -	}
> -
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> -		goto out;
> -	}
> -
> -out:
> -	I915_WRITE(ILK_GDSR, 0);
> -	POSTING_READ(ILK_GDSR);
> -	return ret;
> -}
> -
> -/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
> -static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
> -				u32 hw_domain_mask)
> -{
> -	int err;
> -
> -	/* GEN6_GDRST is not in the gt power well, no need to check
> -	 * for fifo space for the write or forcewake the chip for
> -	 * the read
> -	 */
> -	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
> -
> -	/* Wait for the device to ack the reset requests */
> -	err = __intel_wait_for_register_fw(dev_priv,
> -					   GEN6_GDRST, hw_domain_mask, 0,
> -					   500, 0,
> -					   NULL);
> -	if (err)
> -		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
> -				 hw_domain_mask);
> -
> -	return err;
> -}
> -
> -/**
> - * gen6_reset_engines - reset individual engines
> - * @dev_priv: i915 device
> - * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
> - * @retry: the count of of previous attempts to reset.
> - *
> - * This function will reset the individual engines that are set in engine_mask.
> - * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
> - *
> - * Note: It is responsibility of the caller to handle the difference between
> - * asking full domain reset versus reset for all available individual engines.
> - *
> - * Returns 0 on success, nonzero on error.
> - */
> -static int gen6_reset_engines(struct drm_i915_private *dev_priv,
> -			      unsigned int engine_mask,
> -			      unsigned int retry)
> -{
> -	struct intel_engine_cs *engine;
> -	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> -		[RCS] = GEN6_GRDOM_RENDER,
> -		[BCS] = GEN6_GRDOM_BLT,
> -		[VCS] = GEN6_GRDOM_MEDIA,
> -		[VCS2] = GEN8_GRDOM_MEDIA2,
> -		[VECS] = GEN6_GRDOM_VECS,
> -	};
> -	u32 hw_mask;
> -
> -	if (engine_mask == ALL_ENGINES) {
> -		hw_mask = GEN6_GRDOM_FULL;
> -	} else {
> -		unsigned int tmp;
> -
> -		hw_mask = 0;
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -			hw_mask |= hw_engine_mask[engine->id];
> -	}
> -
> -	return gen6_hw_domain_reset(dev_priv, hw_mask);
> -}
> -
> -static u32 gen11_lock_sfc(struct drm_i915_private *dev_priv,
> -			  struct intel_engine_cs *engine)
> -{
> -	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
> -	i915_reg_t sfc_forced_lock, sfc_forced_lock_ack;
> -	u32 sfc_forced_lock_bit, sfc_forced_lock_ack_bit;
> -	i915_reg_t sfc_usage;
> -	u32 sfc_usage_bit;
> -	u32 sfc_reset_bit;
> -
> -	switch (engine->class) {
> -	case VIDEO_DECODE_CLASS:
> -		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
> -			return 0;
> -
> -		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
> -		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
> -
> -		sfc_forced_lock_ack = GEN11_VCS_SFC_LOCK_STATUS(engine);
> -		sfc_forced_lock_ack_bit  = GEN11_VCS_SFC_LOCK_ACK_BIT;
> -
> -		sfc_usage = GEN11_VCS_SFC_LOCK_STATUS(engine);
> -		sfc_usage_bit = GEN11_VCS_SFC_USAGE_BIT;
> -		sfc_reset_bit = GEN11_VCS_SFC_RESET_BIT(engine->instance);
> -		break;
> -
> -	case VIDEO_ENHANCEMENT_CLASS:
> -		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
> -		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
> -
> -		sfc_forced_lock_ack = GEN11_VECS_SFC_LOCK_ACK(engine);
> -		sfc_forced_lock_ack_bit  = GEN11_VECS_SFC_LOCK_ACK_BIT;
> -
> -		sfc_usage = GEN11_VECS_SFC_USAGE(engine);
> -		sfc_usage_bit = GEN11_VECS_SFC_USAGE_BIT;
> -		sfc_reset_bit = GEN11_VECS_SFC_RESET_BIT(engine->instance);
> -		break;
> -
> -	default:
> -		return 0;
> -	}
> -
> -	/*
> -	 * Tell the engine that a software reset is going to happen. The engine
> -	 * will then try to force lock the SFC (if currently locked, it will
> -	 * remain so until we tell the engine it is safe to unlock; if currently
> -	 * unlocked, it will ignore this and all new lock requests). If SFC
> -	 * ends up being locked to the engine we want to reset, we have to reset
> -	 * it as well (we will unlock it once the reset sequence is completed).
> -	 */
> -	I915_WRITE_FW(sfc_forced_lock,
> -		      I915_READ_FW(sfc_forced_lock) | sfc_forced_lock_bit);
> -
> -	if (__intel_wait_for_register_fw(dev_priv,
> -					 sfc_forced_lock_ack,
> -					 sfc_forced_lock_ack_bit,
> -					 sfc_forced_lock_ack_bit,
> -					 1000, 0, NULL)) {
> -		DRM_DEBUG_DRIVER("Wait for SFC forced lock ack failed\n");
> -		return 0;
> -	}
> -
> -	if (I915_READ_FW(sfc_usage) & sfc_usage_bit)
> -		return sfc_reset_bit;
> -
> -	return 0;
> -}
> -
> -static void gen11_unlock_sfc(struct drm_i915_private *dev_priv,
> -			     struct intel_engine_cs *engine)
> -{
> -	u8 vdbox_sfc_access = RUNTIME_INFO(dev_priv)->vdbox_sfc_access;
> -	i915_reg_t sfc_forced_lock;
> -	u32 sfc_forced_lock_bit;
> -
> -	switch (engine->class) {
> -	case VIDEO_DECODE_CLASS:
> -		if ((BIT(engine->instance) & vdbox_sfc_access) == 0)
> -			return;
> -
> -		sfc_forced_lock = GEN11_VCS_SFC_FORCED_LOCK(engine);
> -		sfc_forced_lock_bit = GEN11_VCS_SFC_FORCED_LOCK_BIT;
> -		break;
> -
> -	case VIDEO_ENHANCEMENT_CLASS:
> -		sfc_forced_lock = GEN11_VECS_SFC_FORCED_LOCK(engine);
> -		sfc_forced_lock_bit = GEN11_VECS_SFC_FORCED_LOCK_BIT;
> -		break;
> -
> -	default:
> -		return;
> -	}
> -
> -	I915_WRITE_FW(sfc_forced_lock,
> -		      I915_READ_FW(sfc_forced_lock) & ~sfc_forced_lock_bit);
> -}
> -
> -/**
> - * gen11_reset_engines - reset individual engines
> - * @dev_priv: i915 device
> - * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
> - *
> - * This function will reset the individual engines that are set in engine_mask.
> - * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
> - *
> - * Note: It is responsibility of the caller to handle the difference between
> - * asking full domain reset versus reset for all available individual engines.
> - *
> - * Returns 0 on success, nonzero on error.
> - */
> -static int gen11_reset_engines(struct drm_i915_private *dev_priv,
> -			       unsigned int engine_mask)
> -{
> -	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> -		[RCS] = GEN11_GRDOM_RENDER,
> -		[BCS] = GEN11_GRDOM_BLT,
> -		[VCS] = GEN11_GRDOM_MEDIA,
> -		[VCS2] = GEN11_GRDOM_MEDIA2,
> -		[VCS3] = GEN11_GRDOM_MEDIA3,
> -		[VCS4] = GEN11_GRDOM_MEDIA4,
> -		[VECS] = GEN11_GRDOM_VECS,
> -		[VECS2] = GEN11_GRDOM_VECS2,
> -	};
> -	struct intel_engine_cs *engine;
> -	unsigned int tmp;
> -	u32 hw_mask;
> -	int ret;
> -
> -	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
> -
> -	if (engine_mask == ALL_ENGINES) {
> -		hw_mask = GEN11_GRDOM_FULL;
> -	} else {
> -		hw_mask = 0;
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -			hw_mask |= hw_engine_mask[engine->id];
> -			hw_mask |= gen11_lock_sfc(dev_priv, engine);
> -		}
> -	}
> -
> -	ret = gen6_hw_domain_reset(dev_priv, hw_mask);
> -
> -	if (engine_mask != ALL_ENGINES)
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -			gen11_unlock_sfc(dev_priv, engine);
> -
> -	return ret;
> -}
> -
>  /**
>   * __intel_wait_for_register_fw - wait until register matches expected state
>   * @dev_priv: the i915 device
> @@ -2191,196 +1825,6 @@ int __intel_wait_for_register(struct drm_i915_private *dev_priv,
>  	return ret;
>  }
>  
> -static int gen8_engine_reset_prepare(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	int ret;
> -
> -	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> -		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
> -
> -	ret = __intel_wait_for_register_fw(dev_priv,
> -					   RING_RESET_CTL(engine->mmio_base),
> -					   RESET_CTL_READY_TO_RESET,
> -					   RESET_CTL_READY_TO_RESET,
> -					   700, 0,
> -					   NULL);
> -	if (ret)
> -		DRM_ERROR("%s: reset request timeout\n", engine->name);
> -
> -	return ret;
> -}
> -
> -static void gen8_engine_reset_cancel(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -
> -	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> -		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> -}
> -
> -static int reset_engines(struct drm_i915_private *i915,
> -			 unsigned int engine_mask,
> -			 unsigned int retry)
> -{
> -	if (INTEL_GEN(i915) >= 11)
> -		return gen11_reset_engines(i915, engine_mask);
> -	else
> -		return gen6_reset_engines(i915, engine_mask, retry);
> -}
> -
> -static int gen8_reset_engines(struct drm_i915_private *dev_priv,
> -			      unsigned int engine_mask,
> -			      unsigned int retry)
> -{
> -	struct intel_engine_cs *engine;
> -	const bool reset_non_ready = retry >= 1;
> -	unsigned int tmp;
> -	int ret;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -		ret = gen8_engine_reset_prepare(engine);
> -		if (ret && !reset_non_ready)
> -			goto skip_reset;
> -
> -		/*
> -		 * If this is not the first failed attempt to prepare,
> -		 * we decide to proceed anyway.
> -		 *
> -		 * By doing so we risk context corruption and with
> -		 * some gens (kbl), possible system hang if reset
> -		 * happens during active bb execution.
> -		 *
> -		 * We rather take context corruption instead of
> -		 * failed reset with a wedged driver/gpu. And
> -		 * active bb execution case should be covered by
> -		 * i915_stop_engines we have before the reset.
> -		 */
> -	}
> -
> -	ret = reset_engines(dev_priv, engine_mask, retry);
> -
> -skip_reset:
> -	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -		gen8_engine_reset_cancel(engine);
> -
> -	return ret;
> -}
> -
> -typedef int (*reset_func)(struct drm_i915_private *,
> -			  unsigned int engine_mask, unsigned int retry);
> -
> -static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
> -{
> -	if (!i915_modparams.reset)
> -		return NULL;
> -
> -	if (INTEL_GEN(dev_priv) >= 8)
> -		return gen8_reset_engines;
> -	else if (INTEL_GEN(dev_priv) >= 6)
> -		return gen6_reset_engines;
> -	else if (IS_GEN(dev_priv, 5))
> -		return ironlake_do_reset;
> -	else if (IS_G4X(dev_priv))
> -		return g4x_do_reset;
> -	else if (IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
> -		return g33_do_reset;
> -	else if (INTEL_GEN(dev_priv) >= 3)
> -		return i915_do_reset;
> -	else
> -		return NULL;
> -}
> -
> -int intel_gpu_reset(struct drm_i915_private *dev_priv,
> -		    const unsigned int engine_mask)
> -{
> -	reset_func reset = intel_get_gpu_reset(dev_priv);
> -	unsigned int retry;
> -	int ret;
> -
> -	GEM_BUG_ON(!engine_mask);
> -
> -	/*
> -	 * We want to perform per-engine reset from atomic context (e.g.
> -	 * softirq), which imposes the constraint that we cannot sleep.
> -	 * However, experience suggests that spending a bit of time waiting
> -	 * for a reset helps in various cases, so for a full-device reset
> -	 * we apply the opposite rule and wait if we want to. As we should
> -	 * always follow up a failed per-engine reset with a full device reset,
> -	 * being a little faster, stricter and more error prone for the
> -	 * atomic case seems an acceptable compromise.
> -	 *
> -	 * Unfortunately this leads to a bimodal routine, when the goal was
> -	 * to have a single reset function that worked for resetting any
> -	 * number of engines simultaneously.
> -	 */
> -	might_sleep_if(engine_mask == ALL_ENGINES);
> -
> -	/*
> -	 * If the power well sleeps during the reset, the reset
> -	 * request may be dropped and never completes (causing -EIO).
> -	 */
> -	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> -	for (retry = 0; retry < 3; retry++) {
> -
> -		/*
> -		 * We stop engines, otherwise we might get failed reset and a
> -		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
> -		 * from system hang if batchbuffer is progressing when
> -		 * the reset is issued, regardless of READY_TO_RESET ack.
> -		 * Thus assume it is best to stop engines on all gens
> -		 * where we have a gpu reset.
> -		 *
> -		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> -		 *
> -		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
> -		 *
> -		 * FIXME: Wa for more modern gens needs to be validated
> -		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> -
> -		ret = -ENODEV;
> -		if (reset) {
> -			ret = reset(dev_priv, engine_mask, retry);
> -			GEM_TRACE("engine_mask=%x, ret=%d, retry=%d\n",
> -				  engine_mask, ret, retry);
> -		}
> -		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
> -			break;
> -
> -		cond_resched();
> -	}
> -	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> -
> -	return ret;
> -}
> -
> -bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
> -{
> -	return intel_get_gpu_reset(dev_priv) != NULL;
> -}
> -
> -bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
> -{
> -	return (INTEL_INFO(dev_priv)->has_reset_engine &&
> -		i915_modparams.reset >= 2);
> -}
> -
> -int intel_reset_guc(struct drm_i915_private *dev_priv)
> -{
> -	u32 guc_domain = INTEL_GEN(dev_priv) >= 11 ? GEN11_GRDOM_GUC :
> -						     GEN9_GRDOM_GUC;
> -	int ret;
> -
> -	GEM_BUG_ON(!HAS_GUC(dev_priv));
> -
> -	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> -	ret = gen6_hw_domain_reset(dev_priv, guc_domain);
> -	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> -
> -	return ret;
> -}
> -
>  bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
>  {
>  	return check_for_unclaimed_mmio(dev_priv);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index e6073cd4719c..2b2ecd76c2ac 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -4,6 +4,8 @@
>   * Copyright © 2018 Intel Corporation
>   */
>  
> +#include "../i915_reset.h"
> +
>  #include "../i915_selftest.h"
>  #include "igt_flush_test.h"
>  #include "igt_spinner.h"
> diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> index 9009d7b8b136..a8cac56be835 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> @@ -5,6 +5,7 @@
>   */
>  
>  #include "../i915_selftest.h"
> +#include "../i915_reset.h"
>  
>  #include "igt_flush_test.h"
>  #include "igt_reset.h"
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c
  2019-01-16 15:06   ` Mika Kuoppala
@ 2019-01-16 15:31     ` Chris Wilson
  0 siblings, 0 replies; 20+ messages in thread
From: Chris Wilson @ 2019-01-16 15:31 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-01-16 15:06:37)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Currently the code to reset the GPU and our state is spread widely
> > across a few files. Pull the logic together into a common file.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
Dunno how it goes, but my gut feeling is that this would have
been better at the end of the series, after the dust has settled.

Or perhaps if we applied it last June... :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/8] drm/i915: Make all GPU resets atomic
  2019-01-14 21:04 ` [PATCH 4/8] drm/i915: Make all GPU resets atomic Chris Wilson
@ 2019-01-17 14:14   ` Mika Kuoppala
  0 siblings, 0 replies; 20+ messages in thread
From: Mika Kuoppala @ 2019-01-17 14:14 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> In preparation for the next few commits, make resetting the GPU atomic.
> Currently, we have prepared gen6+ for atomic resetting of individual
> engines, but now there is a requirement to perform the whole device
> level reset (just the register poking) from inside an atomic context.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_reset.c             | 50 ++++++++++---------
>  .../gpu/drm/i915/selftests/mock_gem_device.c  |  4 +-
>  2 files changed, 29 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index e2e40b44a9a8..f9512e07646d 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -144,14 +144,14 @@ static int i915_do_reset(struct drm_i915_private *i915,
>  
>  	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
>  	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	usleep_range(50, 200);
> -	err = wait_for(i915_in_reset(pdev), 500);
> +	udelay(50);
> +	err = wait_for_atomic(i915_in_reset(pdev), 50);
>  
>  	/* Clear the reset request. */
>  	pci_write_config_byte(pdev, I915_GDRST, 0);
> -	usleep_range(50, 200);
> +	udelay(50);
>  	if (!err)
> -		err = wait_for(!i915_in_reset(pdev), 500);
> +		err = wait_for_atomic(!i915_in_reset(pdev), 50);

50ms still seems long but I guess you want to play it safe.

Wouldn't it be nice if we could easily publish a completion
time value to some external database from CI.
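
(As an aside, a tiny userspace model of the trade-off under review — a
sleeping poll with a generous millisecond budget versus a busy "atomic"
poll with a much tighter one — might make the wait_for() vs
wait_for_atomic() swap easier to follow. This is only an illustration of
the pattern, not the i915 macros themselves, which take a condition
expression and do their own timekeeping.)

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static bool condition_met(void)
{
	/* stand-in for a "reset acknowledged" style hardware condition */
	static int polls;
	return ++polls > 3;
}

/* Sleeping poll: fine in process context, budget in milliseconds. */
static int wait_sleeping(unsigned int timeout_ms)
{
	struct timespec delay = { .tv_sec = 0, .tv_nsec = 1000 * 1000 };
	unsigned int waited_ms = 0;

	while (!condition_met()) {
		if (waited_ms++ >= timeout_ms)
			return -1; /* -ETIMEDOUT in the kernel */
		nanosleep(&delay, NULL); /* may sleep: illegal in atomic context */
	}
	return 0;
}

/* Busy poll: never sleeps, so usable from atomic context, but it burns
 * CPU and therefore wants a much smaller budget. */
static int wait_atomic(unsigned int max_spins)
{
	unsigned int spins = 0;

	while (!condition_met()) {
		if (spins++ >= max_spins)
			return -1;
		/* cpu_relax() in the kernel; nothing that can sleep */
	}
	return 0;
}

int main(void)
{
	printf("sleeping wait: %d\n", wait_sleeping(500));
	printf("atomic wait:   %d\n", wait_atomic(50));
	return 0;
}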

>  
>  	return err;
>  }
> @@ -171,7 +171,7 @@ static int g33_do_reset(struct drm_i915_private *i915,
>  	struct pci_dev *pdev = i915->drm.pdev;
>  
>  	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	return wait_for(g4x_reset_complete(pdev), 500);
> +	return wait_for_atomic(g4x_reset_complete(pdev), 50);
>  }
>  
>  static int g4x_do_reset(struct drm_i915_private *dev_priv,
> @@ -182,13 +182,13 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  	int ret;
>  
>  	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> +	I915_WRITE_FW(VDECCLK_GATE_D,
> +		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ_FW(VDECCLK_GATE_D);
>  
>  	pci_write_config_byte(pdev, I915_GDRST,
>  			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
>  		goto out;
> @@ -196,7 +196,7 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  
>  	pci_write_config_byte(pdev, I915_GDRST,
>  			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
>  		goto out;
> @@ -205,9 +205,9 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  out:
>  	pci_write_config_byte(pdev, I915_GDRST, 0);
>  
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> +	I915_WRITE_FW(VDECCLK_GATE_D,
> +		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ_FW(VDECCLK_GATE_D);
>  
>  	return ret;
>  }
> @@ -218,27 +218,29 @@ static int ironlake_do_reset(struct drm_i915_private *dev_priv,
>  {
>  	int ret;
>  
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> +	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> +	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
> +					   ILK_GRDOM_RESET_ENABLE, 0,
> +					   5000, 0,
> +					   NULL);

From 500ms to 5ms. There has been some slack in there for sure...
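
(If the 5000 there is the fast_timeout_us argument and the trailing 0 the
slow_timeout_ms one, as the __intel_wait_for_register_fw() prototype
suggests, that is 5000 us = 5 ms of busy polling — a factor of 100 down
from the old 500 ms sleeping wait.)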

>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
>  		goto out;
>  	}
>  
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> +	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> +	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
> +					   ILK_GRDOM_RESET_ENABLE, 0,
> +					   5000, 0,
> +					   NULL);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
>  		goto out;
>  	}
>  
>  out:
> -	I915_WRITE(ILK_GDSR, 0);
> -	POSTING_READ(ILK_GDSR);
> +	I915_WRITE_FW(ILK_GDSR, 0);
> +	POSTING_READ_FW(ILK_GDSR);
>  	return ret;
>  }
>  
> @@ -572,7 +574,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
>  		ret = -ENODEV;
>  		if (reset) {

Ok, I might have missed a spot, but it looks to me like you can lift
the might_sleep_if(engine_mask == ALL_ENGINES) from above.

>  			GEM_TRACE("engine_mask=%x\n", engine_mask);
> +			preempt_disable();
>  			ret = reset(i915, engine_mask, retry);
> +			preempt_enable();
>  		}
>  		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
>  			break;
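
(A toy model of the point above about might_sleep_if(): the annotation
only asserts that sleeping is allowed at that spot, so if nothing in the
full-device path still sleeps once the register poking runs under
preempt_disable()/preempt_enable(), the annotation arguably no longer
describes anything the function does. Names below are illustrative, not
the kernel's.)

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

static bool preemption_disabled;

static void preempt_disable(void) { preemption_disabled = true; }
static void preempt_enable(void)  { preemption_disabled = false; }

/* Model of might_sleep(): complain if sleeping would be illegal here. */
static void might_sleep(void)
{
	assert(!preemption_disabled);
}

/* The register poking: by construction it never sleeps. */
static int poke_reset_registers(void)
{
	return 0;
}

static int gpu_reset(bool all_engines)
{
	int ret;

	/* The old, coarse annotation: only justified while the full-device
	 * path below still contained sleeping waits. */
	if (all_engines)
		might_sleep();

	preempt_disable();
	ret = poke_reset_registers();
	preempt_enable();

	return ret;
}

int main(void)
{
	printf("full reset: %d\n", gpu_reset(true));
	printf("engine reset: %d\n", gpu_reset(false));
	return 0;
}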
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 3cda66292e76..888c6978bc54 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
>  	i915_gem_contexts_lost(i915);
>  	mutex_unlock(&i915->drm.struct_mutex);
>  
> -	cancel_delayed_work_sync(&i915->gt.retire_work);
> -	cancel_delayed_work_sync(&i915->gt.idle_work);
> +	drain_delayed_work(&i915->gt.retire_work);
> +	drain_delayed_work(&i915->gt.idle_work);

Yeah, but why in this patch?
-Mika

>  	i915_gem_drain_workqueue(i915);
>  
>  	mutex_lock(&i915->drm.struct_mutex);
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/8] drm/i915/guc: Disable global reset
  2019-01-14 21:04 ` [PATCH 5/8] drm/i915/guc: Disable global reset Chris Wilson
@ 2019-01-17 14:24   ` Mika Kuoppala
  2019-01-17 18:27     ` Daniele Ceraolo Spurio
  0 siblings, 1 reply; 20+ messages in thread
From: Mika Kuoppala @ 2019-01-17 14:24 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> The guc (and huc) currently inextricably depend on struct_mutex for
> device reinitialisation from inside the reset, and indeed taking any
> mutex here is verboten (as we must be able to reset from underneath any
> of our mutexes). That makes recovering the guc unviable without, for
> example, reserving contiguous vma space and pages for it to use.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

We do want an ack from Daniele as well.

-Mika

> ---
>  drivers/gpu/drm/i915/i915_reset.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index f9512e07646d..c9a844d2626f 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -590,6 +590,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
>  
>  bool intel_has_gpu_reset(struct drm_i915_private *i915)
>  {
> +	if (USES_GUC(i915))
> +		return false;
> +
>  	return intel_get_gpu_reset(i915);
>  }
>  
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/8] drm/i915/guc: Disable global reset
  2019-01-17 14:24   ` Mika Kuoppala
@ 2019-01-17 18:27     ` Daniele Ceraolo Spurio
  0 siblings, 0 replies; 20+ messages in thread
From: Daniele Ceraolo Spurio @ 2019-01-17 18:27 UTC (permalink / raw)
  To: Mika Kuoppala, Chris Wilson, intel-gfx



On 01/17/2019 06:24 AM, Mika Kuoppala wrote:
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
>> The guc (and huc) currently inextricably depend on struct_mutex for
>> device reinitialisation from inside the reset, and indeed taking any
>> mutex here is verboten (as we must be able to reset from underneath any
>> of our mutexes). That makes recovering the guc unviable without, for
>> example, reserving contiguous vma space and pages for it to use.
>>
>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> 
> We do want ack from Daniele as well.
> 

As long as no one opposes the temporary arrangement we discussed to 
re-enable reset with GuC (perma-pinning the firmware in the 
GuC-inaccessible range of the GGTT),

Acked-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>

Daniele

> -Mika
> 
>> ---
>>   drivers/gpu/drm/i915/i915_reset.c | 3 +++
>>   1 file changed, 3 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
>> index f9512e07646d..c9a844d2626f 100644
>> --- a/drivers/gpu/drm/i915/i915_reset.c
>> +++ b/drivers/gpu/drm/i915/i915_reset.c
>> @@ -590,6 +590,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
>>   
>>   bool intel_has_gpu_reset(struct drm_i915_private *i915)
>>   {
>> +	if (USES_GUC(i915))
>> +		return false;
>> +
>>   	return intel_get_gpu_reset(i915);
>>   }
>>   
>> -- 
>> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2019-01-17 18:27 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-14 21:04 Mika's reward Chris Wilson
2019-01-14 21:04 ` [PATCH 1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Chris Wilson
2019-01-15 11:56   ` Mika Kuoppala
2019-01-15 12:05     ` Chris Wilson
2019-01-16  9:27       ` Chris Wilson
2019-01-16 15:04         ` Mika Kuoppala
2019-01-14 21:04 ` [PATCH 2/8] drm/i915: Differentiate between ggtt->mutex and ppgtt->mutex Chris Wilson
2019-01-14 21:04 ` [PATCH 3/8] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
2019-01-16 15:06   ` Mika Kuoppala
2019-01-16 15:31     ` Chris Wilson
2019-01-14 21:04 ` [PATCH 4/8] drm/i915: Make all GPU resets atomic Chris Wilson
2019-01-17 14:14   ` Mika Kuoppala
2019-01-14 21:04 ` [PATCH 5/8] drm/i915/guc: Disable global reset Chris Wilson
2019-01-17 14:24   ` Mika Kuoppala
2019-01-17 18:27     ` Daniele Ceraolo Spurio
2019-01-14 21:04 ` [PATCH 6/8] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
2019-01-14 21:04 ` [PATCH 7/8] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
2019-01-14 21:04 ` [PATCH 8/8] drm/i915: Issue engine resets onto idle engines Chris Wilson
2019-01-14 21:26 ` ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Serialise concurrent calls to i915_gem_set_wedged() Patchwork
2019-01-14 21:59   ` Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.