* Cleanup live_hangcheck flippers
@ 2018-07-11  7:36 Chris Wilson
  2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
                   ` (10 more replies)
  0 siblings, 11 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

While killing off struct_mutex inside reset opens the door to killing
off struct_mutex *everywhere*, my ulterior goal for this series is to fix
up the live_hangcheck/live_workaround BAT/IGT flip-flops.

Please review kindly,
-Chris



* [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  8:09   ` Daniel Vetter
  2018-07-11  9:33   ` Daniel Vetter
  2018-07-11  7:36 ` [PATCH 2/7] drm/i915: Move fence register tracking to GGTT Chris Wilson
                   ` (9 subsequent siblings)
  10 siblings, 2 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

Add a mutex into struct i915_address_space to be used while operating on
the vma and their lists for a particular vm. As this may be called from
the shrinker, we taint the mutex with fs_reclaim so that from the start
lockdep warns us if we are caught holding the mutex across an
allocation. (With such small steps we will eventually rid ourselves of
struct_mutex recursion!)
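
(As an aside, not part of the patch: a hedged sketch of the kind of caller
the taint is designed to catch. The helper name and allocation size below
are made up for illustration; only vm->mutex and the fs_reclaim tainting
come from the diff that follows.)

	/* Hypothetical caller, for illustration only. */
	static void *sketch_alloc_under_vm_lock(struct i915_address_space *vm)
	{
		void *ptr;

		mutex_lock(&vm->mutex);
		/*
		 * A GFP_KERNEL allocation may enter fs_reclaim and hence the
		 * shrinker, which may in turn want vm->mutex. Thanks to the
		 * dummy acquire under fs_reclaim at init, lockdep reports
		 * this inversion immediately, without real memory pressure.
		 */
		ptr = kmalloc(SZ_4K, GFP_KERNEL);
		mutex_unlock(&vm->mutex);

		return ptr;
	}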

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h          |  2 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c      | 10 ++++++++++
 drivers/gpu/drm/i915/i915_gem_gtt.h      |  2 ++
 drivers/gpu/drm/i915/i915_gem_shrinker.c | 12 ++++++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index eeb002a47032..01dd29837233 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3304,7 +3304,7 @@ unsigned long i915_gem_shrink(struct drm_i915_private *i915,
 unsigned long i915_gem_shrink_all(struct drm_i915_private *i915);
 void i915_gem_shrinker_register(struct drm_i915_private *i915);
 void i915_gem_shrinker_unregister(struct drm_i915_private *i915);
-
+void i915_gem_shrinker_taints_mutex(struct mutex *mutex);
 
 /* i915_gem_tiling.c */
 static inline bool i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index abd81fb9b0b6..d0acef299b9c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -531,6 +531,14 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
 static void i915_address_space_init(struct i915_address_space *vm,
 				    struct drm_i915_private *dev_priv)
 {
+	/*
+	 * The vm->mutex must be reclaim safe (for use in the shrinker).
+	 * Do a dummy acquire now under fs_reclaim so that any allocation
+	 * attempt holding the lock is immediately reported by lockdep.
+	 */
+	mutex_init(&vm->mutex);
+	i915_gem_shrinker_taints_mutex(&vm->mutex);
+
 	GEM_BUG_ON(!vm->total);
 	drm_mm_init(&vm->mm, 0, vm->total);
 	vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;
@@ -551,6 +559,8 @@ static void i915_address_space_fini(struct i915_address_space *vm)
 	spin_unlock(&vm->free_pages.lock);
 
 	drm_mm_takedown(&vm->mm);
+
+	mutex_destroy(&vm->mutex);
 }
 
 static int __setup_page_dma(struct i915_address_space *vm,
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index feda45dfd481..14e62651010b 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -293,6 +293,8 @@ struct i915_address_space {
 
 	bool closed;
 
+	struct mutex mutex; /* protects vma and our lists */
+
 	struct i915_page_dma scratch_page;
 	struct i915_page_table *scratch_pt;
 	struct i915_page_directory *scratch_pd;
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index c61f5b80fee3..ea90d3a0d511 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -23,6 +23,7 @@
  */
 
 #include <linux/oom.h>
+#include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
@@ -531,3 +532,14 @@ void i915_gem_shrinker_unregister(struct drm_i915_private *i915)
 	WARN_ON(unregister_oom_notifier(&i915->mm.oom_notifier));
 	unregister_shrinker(&i915->mm.shrinker);
 }
+
+void i915_gem_shrinker_taints_mutex(struct mutex *mutex)
+{
+	if (!IS_ENABLED(CONFIG_LOCKDEP))
+		return;
+
+	fs_reclaim_acquire(GFP_KERNEL);
+	mutex_lock(mutex);
+	mutex_unlock(mutex);
+	fs_reclaim_release(GFP_KERNEL);
+}
-- 
2.18.0


* [PATCH 2/7] drm/i915: Move fence register tracking to GGTT
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
  2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  8:19   ` Daniel Vetter
  2018-07-11  7:36 ` [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex Chris Wilson
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

As the fence registers define special regions of the mappable aperture
inside the Global GTT, and we track those regions using GGTT VMA, it
makes sense to pull that bookkeeping under i915_ggtt. The advantage is
that we can then start using a local GGTT lock to handle the fence
registers (in conjunction with the GGTT VMA) rather than struct_mutex.
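
As a rough before/after sketch of the bookkeeping move (using the same
names as the diff below):

	/* before: fence state lived in drm_i915_private */
	struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];

	/* after: fence state hangs off the GGTT it describes */
	struct drm_i915_fence_reg *reg = &dev_priv->ggtt.fence_regs[i];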

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gvt/gvt.h            |  2 +-
 drivers/gpu/drm/i915/i915_debugfs.c       | 16 ++---
 drivers/gpu/drm/i915/i915_drv.c           |  4 +-
 drivers/gpu/drm/i915/i915_drv.h           |  7 ---
 drivers/gpu/drm/i915/i915_gem.c           | 33 +++++-----
 drivers/gpu/drm/i915/i915_gem_fence_reg.c | 76 ++++++++++++-----------
 drivers/gpu/drm/i915/i915_gem_fence_reg.h |  9 ++-
 drivers/gpu/drm/i915/i915_gem_gtt.c       |  8 ++-
 drivers/gpu/drm/i915/i915_gem_gtt.h       |  7 ++-
 drivers/gpu/drm/i915/i915_gpu_error.c     |  7 ++-
 10 files changed, 89 insertions(+), 80 deletions(-)

diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
index de2a3a2580be..11609a4003ff 100644
--- a/drivers/gpu/drm/i915/gvt/gvt.h
+++ b/drivers/gpu/drm/i915/gvt/gvt.h
@@ -391,7 +391,7 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt);
 #define gvt_hidden_gmadr_end(gvt) (gvt_hidden_gmadr_base(gvt) \
 				   + gvt_hidden_sz(gvt) - 1)
 
-#define gvt_fence_sz(gvt) (gvt->dev_priv->num_fence_regs)
+#define gvt_fence_sz(gvt) ((gvt)->dev_priv->ggtt.num_fence_regs)
 
 /* Aperture/GM space definitions for vGPU */
 #define vgpu_aperture_offset(vgpu)	((vgpu)->gm.low_gm_node.start)
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 099f97ef2303..75ffed6a3f31 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -914,20 +914,20 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
 
 static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 {
-	struct drm_i915_private *dev_priv = node_to_i915(m->private);
-	struct drm_device *dev = &dev_priv->drm;
+	struct drm_i915_private *i915 = node_to_i915(m->private);
+	const struct i915_ggtt *ggtt = &i915->ggtt;
 	int i, ret;
 
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
+	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
 	if (ret)
 		return ret;
 
-	seq_printf(m, "Total fences = %d\n", dev_priv->num_fence_regs);
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct i915_vma *vma = dev_priv->fence_regs[i].vma;
+	seq_printf(m, "Total fences = %d\n", ggtt->num_fence_regs);
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct i915_vma *vma = ggtt->fence_regs[i].vma;
 
 		seq_printf(m, "Fence %d, pin count = %d, object = ",
-			   i, dev_priv->fence_regs[i].pin_count);
+			   i, ggtt->fence_regs[i].pin_count);
 		if (!vma)
 			seq_puts(m, "unused");
 		else
@@ -935,7 +935,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 		seq_putc(m, '\n');
 	}
 
-	mutex_unlock(&dev->struct_mutex);
+	mutex_unlock(&i915->drm.struct_mutex);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 3eba3d1ab5b8..97a2054c38d4 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -321,7 +321,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
 		value = pdev->revision;
 		break;
 	case I915_PARAM_NUM_FENCES_AVAIL:
-		value = dev_priv->num_fence_regs;
+		value = dev_priv->ggtt.num_fence_regs;
 		break;
 	case I915_PARAM_HAS_OVERLAY:
 		value = dev_priv->overlay ? 1 : 0;
@@ -1154,8 +1154,6 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
 
 	intel_uncore_sanitize(dev_priv);
 
-	i915_gem_load_init_fences(dev_priv);
-
 	/* On the 945G/GM, the chipset reports the MSI capability on the
 	 * integrated graphics even though the support isn't actually there
 	 * according to the published specs.  It doesn't appear to function
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 01dd29837233..a7f2d747e221 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -966,9 +966,6 @@ struct i915_gem_mm {
 	struct notifier_block vmap_notifier;
 	struct shrinker shrinker;
 
-	/** LRU list of objects with fence regs on them. */
-	struct list_head fence_list;
-
 	/**
 	 * Workqueue to fault in userptr pages, flushed by the execbuf
 	 * when required but otherwise left to userspace to try again
@@ -1678,9 +1675,6 @@ struct drm_i915_private {
 	/* protects panel power sequencer state */
 	struct mutex pps_mutex;
 
-	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
-	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
-
 	unsigned int fsb_freq, mem_freq, is_ddr3;
 	unsigned int skl_preferred_vco_freq;
 	unsigned int max_cdclk_freq;
@@ -2886,7 +2880,6 @@ int i915_gem_wait_ioctl(struct drm_device *dev, void *data,
 void i915_gem_sanitize(struct drm_i915_private *i915);
 int i915_gem_init_early(struct drm_i915_private *dev_priv);
 void i915_gem_cleanup_early(struct drm_i915_private *dev_priv);
-void i915_gem_load_init_fences(struct drm_i915_private *dev_priv);
 int i915_gem_freeze(struct drm_i915_private *dev_priv);
 int i915_gem_freeze_late(struct drm_i915_private *dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 07a92ca61dbf..356c86071ccc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2214,8 +2214,9 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	intel_runtime_pm_put(i915);
 }
 
-void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
+void i915_gem_runtime_suspend(struct drm_i915_private *i915)
 {
+	struct i915_ggtt *ggtt = &i915->ggtt;
 	struct drm_i915_gem_object *obj, *on;
 	int i;
 
@@ -2227,15 +2228,15 @@ void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
 	 */
 
 	list_for_each_entry_safe(obj, on,
-				 &dev_priv->mm.userfault_list, userfault_link)
+				 &i915->mm.userfault_list, userfault_link)
 		__i915_gem_object_release_mmap(obj);
 
 	/* The fence will be lost when the device powers down. If any were
 	 * in use by hardware (i.e. they are pinned), we should not be powering
 	 * down! All other fences will be reacquired by the user upon waking.
 	 */
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
 
 		/* Ideally we want to assert that the fence register is not
 		 * live at this point (i.e. that no piece of code will be
@@ -5630,32 +5631,33 @@ i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine(engine);
 }
 
-void
-i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
+void i915_ggtt_init_fences(struct i915_ggtt *ggtt)
 {
+	struct drm_i915_private *dev_priv = ggtt->vm.i915;
 	int i;
 
 	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
 	    !IS_CHERRYVIEW(dev_priv))
-		dev_priv->num_fence_regs = 32;
+		ggtt->num_fence_regs = 32;
 	else if (INTEL_GEN(dev_priv) >= 4 ||
 		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
 		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
-		dev_priv->num_fence_regs = 16;
+		ggtt->num_fence_regs = 16;
 	else
-		dev_priv->num_fence_regs = 8;
+		ggtt->num_fence_regs = 8;
 
 	if (intel_vgpu_active(dev_priv))
-		dev_priv->num_fence_regs =
-				I915_READ(vgtif_reg(avail_rs.fence_num));
+		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
+
+	INIT_LIST_HEAD(&ggtt->fence_list);
 
 	/* Initialize fence registers to zero */
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
 
-		fence->i915 = dev_priv;
+		fence->ggtt = ggtt;
 		fence->id = i;
-		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
+		list_add_tail(&fence->link, &ggtt->fence_list);
 	}
 	i915_gem_restore_fences(dev_priv);
 
@@ -5672,7 +5674,6 @@ static void i915_gem_init__mm(struct drm_i915_private *i915)
 
 	INIT_LIST_HEAD(&i915->mm.unbound_list);
 	INIT_LIST_HEAD(&i915->mm.bound_list);
-	INIT_LIST_HEAD(&i915->mm.fence_list);
 	INIT_LIST_HEAD(&i915->mm.userfault_list);
 
 	INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index d548ac05ccd7..60fa5a8276cb 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -64,7 +64,7 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
 	int fence_pitch_shift;
 	u64 val;
 
-	if (INTEL_GEN(fence->i915) >= 6) {
+	if (INTEL_GEN(fence->ggtt->vm.i915) >= 6) {
 		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
 		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
 		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
@@ -93,7 +93,7 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
 	}
 
 	if (!pipelined) {
-		struct drm_i915_private *dev_priv = fence->i915;
+		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
 
 		/* To w/a incoherency with non-atomic 64-bit register updates,
 		 * we split the 64-bit update into two 32-bit writes. In order
@@ -129,7 +129,7 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
 		GEM_BUG_ON(!is_power_of_2(vma->fence_size));
 		GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
 
-		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->i915))
+		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->ggtt->vm.i915))
 			stride /= 128;
 		else
 			stride /= 512;
@@ -145,7 +145,7 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
 	}
 
 	if (!pipelined) {
-		struct drm_i915_private *dev_priv = fence->i915;
+		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
 		i915_reg_t reg = FENCE_REG(fence->id);
 
 		I915_WRITE(reg, val);
@@ -177,7 +177,7 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
 	}
 
 	if (!pipelined) {
-		struct drm_i915_private *dev_priv = fence->i915;
+		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
 		i915_reg_t reg = FENCE_REG(fence->id);
 
 		I915_WRITE(reg, val);
@@ -193,9 +193,9 @@ static void fence_write(struct drm_i915_fence_reg *fence,
 	 * and explicitly managed for internal users.
 	 */
 
-	if (IS_GEN2(fence->i915))
+	if (IS_GEN2(fence->ggtt->vm.i915))
 		i830_write_fence_reg(fence, vma);
-	else if (IS_GEN3(fence->i915))
+	else if (IS_GEN3(fence->ggtt->vm.i915))
 		i915_write_fence_reg(fence, vma);
 	else
 		i965_write_fence_reg(fence, vma);
@@ -210,6 +210,7 @@ static void fence_write(struct drm_i915_fence_reg *fence,
 static int fence_update(struct drm_i915_fence_reg *fence,
 			struct i915_vma *vma)
 {
+	struct i915_ggtt *ggtt = fence->ggtt;
 	int ret;
 
 	if (vma) {
@@ -250,16 +251,16 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		fence->vma->fence = NULL;
 		fence->vma = NULL;
 
-		list_move(&fence->link, &fence->i915->mm.fence_list);
+		list_move(&fence->link, &ggtt->fence_list);
 	}
 
 	/* We only need to update the register itself if the device is awake.
 	 * If the device is currently powered down, we will defer the write
 	 * to the runtime resume, see i915_gem_restore_fences().
 	 */
-	if (intel_runtime_pm_get_if_in_use(fence->i915)) {
+	if (intel_runtime_pm_get_if_in_use(ggtt->vm.i915)) {
 		fence_write(fence, vma);
-		intel_runtime_pm_put(fence->i915);
+		intel_runtime_pm_put(ggtt->vm.i915);
 	}
 
 	if (vma) {
@@ -268,7 +269,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 			fence->vma = vma;
 		}
 
-		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
+		list_move_tail(&fence->link, &ggtt->fence_list);
 	}
 
 	return 0;
@@ -298,11 +299,11 @@ int i915_vma_put_fence(struct i915_vma *vma)
 	return fence_update(fence, NULL);
 }
 
-static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
+static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
 {
 	struct drm_i915_fence_reg *fence;
 
-	list_for_each_entry(fence, &dev_priv->mm.fence_list, link) {
+	list_for_each_entry(fence, &ggtt->fence_list, link) {
 		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
 
 		if (fence->pin_count)
@@ -312,7 +313,7 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 	}
 
 	/* Wait for completion of pending flips which consume fences */
-	if (intel_has_pending_fb_unpin(dev_priv))
+	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
 		return ERR_PTR(-EAGAIN);
 
 	return ERR_PTR(-EDEADLK);
@@ -339,14 +340,15 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
 int
 i915_vma_pin_fence(struct i915_vma *vma)
 {
-	struct drm_i915_fence_reg *fence;
+	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
 	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
+	struct drm_i915_fence_reg *fence;
 	int err;
 
 	/* Note that we revoke fences on runtime suspend. Therefore the user
 	 * must keep the device awake whilst using the fence.
 	 */
-	assert_rpm_wakelock_held(vma->vm->i915);
+	assert_rpm_wakelock_held(ggtt->vm.i915);
 
 	/* Just update our place in the LRU if our fence is getting reused. */
 	if (vma->fence) {
@@ -354,12 +356,11 @@ i915_vma_pin_fence(struct i915_vma *vma)
 		GEM_BUG_ON(fence->vma != vma);
 		fence->pin_count++;
 		if (!fence->dirty) {
-			list_move_tail(&fence->link,
-				       &fence->i915->mm.fence_list);
+			list_move_tail(&fence->link, &ggtt->fence_list);
 			return 0;
 		}
 	} else if (set) {
-		fence = fence_find(vma->vm->i915);
+		fence = fence_find(ggtt);
 		if (IS_ERR(fence))
 			return PTR_ERR(fence);
 
@@ -385,28 +386,29 @@ i915_vma_pin_fence(struct i915_vma *vma)
 
 /**
  * i915_reserve_fence - Reserve a fence for vGPU
- * @dev_priv: i915 device private
+ * @i915: i915 device private
  *
  * This function walks the fence regs looking for a free one and remove
  * it from the fence_list. It is used to reserve fence for vGPU to use.
  */
 struct drm_i915_fence_reg *
-i915_reserve_fence(struct drm_i915_private *dev_priv)
+i915_reserve_fence(struct drm_i915_private *i915)
 {
+	struct i915_ggtt *ggtt = &i915->ggtt;
 	struct drm_i915_fence_reg *fence;
 	int count;
 	int ret;
 
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	lockdep_assert_held(&i915->drm.struct_mutex);
 
 	/* Keep at least one fence available for the display engine. */
 	count = 0;
-	list_for_each_entry(fence, &dev_priv->mm.fence_list, link)
+	list_for_each_entry(fence, &ggtt->fence_list, link)
 		count += !fence->pin_count;
 	if (count <= 1)
 		return ERR_PTR(-ENOSPC);
 
-	fence = fence_find(dev_priv);
+	fence = fence_find(ggtt);
 	if (IS_ERR(fence))
 		return fence;
 
@@ -429,14 +431,14 @@ i915_reserve_fence(struct drm_i915_private *dev_priv)
  */
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 {
-	lockdep_assert_held(&fence->i915->drm.struct_mutex);
+	lockdep_assert_held(&fence->ggtt->vm.i915->drm.struct_mutex);
 
-	list_add(&fence->link, &fence->i915->mm.fence_list);
+	list_add(&fence->link, &fence->ggtt->fence_list);
 }
 
 /**
  * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
+ * @i915: i915 device private
  *
  * Removes all GTT mmappings via the fence registers. This forces any user
  * of the fence to reacquire that fence before continuing with their access.
@@ -444,14 +446,15 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
  * revoke concurrent userspace access via GTT mmaps until the hardware has been
  * reset and the fence registers have been restored.
  */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
+void i915_gem_revoke_fences(struct drm_i915_private *i915)
 {
+	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
+	lockdep_assert_held(&i915->drm.struct_mutex);
 
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
 
 		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
 
@@ -462,18 +465,19 @@ void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
 
 /**
  * i915_gem_restore_fences - restore fence state
- * @dev_priv: i915 device private
+ * @i915: i915 device private
  *
  * Restore the hw fence state to match the software tracking again, to be called
  * after a gpu reset and on resume. Note that on runtime suspend we only cancel
  * the fences, to be reacquired by the user later.
  */
-void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
+void i915_gem_restore_fences(struct drm_i915_private *i915)
 {
+	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
 		struct i915_vma *vma = reg->vma;
 
 		GEM_BUG_ON(vma && vma->fence != reg);
@@ -486,7 +490,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 			GEM_BUG_ON(!reg->dirty);
 			GEM_BUG_ON(i915_vma_has_userfault(vma));
 
-			list_move(&reg->link, &dev_priv->mm.fence_list);
+			list_move(&reg->link, &ggtt->fence_list);
 			vma->fence = NULL;
 			vma = NULL;
 		}
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..c8f1d0cdfa90 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -28,16 +28,20 @@
 #include <linux/list.h>
 
 struct drm_i915_private;
+struct i915_ggtt;
 struct i915_vma;
 
 #define I965_FENCE_PAGE 4096UL
 
 struct drm_i915_fence_reg {
 	struct list_head link;
-	struct drm_i915_private *i915;
+
+	struct i915_ggtt *ggtt;
 	struct i915_vma *vma;
+
 	int pin_count;
 	int id;
+
 	/**
 	 * Whether the tiling parameters for the currently
 	 * associated fence register have changed. Note that
@@ -49,5 +53,6 @@ struct drm_i915_fence_reg {
 	bool dirty;
 };
 
-#endif
+void i915_ggtt_init_fences(struct i915_ggtt *ggtt);
 
+#endif
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d0acef299b9c..abf41f90a925 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -3595,9 +3595,11 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
 		ggtt->vm.mm.color_adjust = i915_gtt_color_adjust;
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
-	if (!io_mapping_init_wc(&dev_priv->ggtt.iomap,
-				dev_priv->ggtt.gmadr.start,
-				dev_priv->ggtt.mappable_end)) {
+	i915_ggtt_init_fences(ggtt);
+
+	if (!io_mapping_init_wc(&ggtt->iomap,
+				ggtt->gmadr.start,
+				ggtt->mappable_end)) {
 		ret = -EIO;
 		goto out_gtt_cleanup;
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 14e62651010b..f35a85284b1a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -38,6 +38,7 @@
 #include <linux/mm.h>
 #include <linux/pagevec.h>
 
+#include "i915_gem_fence_reg.h"
 #include "i915_request.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
@@ -57,7 +58,6 @@
 #define I915_MAX_NUM_FENCE_BITS 6
 
 struct drm_i915_file_private;
-struct drm_i915_fence_reg;
 struct i915_vma;
 
 typedef u32 gen6_pte_t;
@@ -396,6 +396,11 @@ struct i915_ggtt {
 
 	int mtrr;
 
+	/** LRU list of objects with fence regs on them. */
+	struct list_head fence_list;
+	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES];
+	int num_fence_regs;
+
 	struct drm_mm_node error_capture;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 8c81cf3aa182..9dfe1d02f098 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1093,16 +1093,17 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
 static void gem_record_fences(struct i915_gpu_state *error)
 {
 	struct drm_i915_private *dev_priv = error->i915;
+	const struct i915_ggtt *ggtt = &error->i915->ggtt;
 	int i;
 
 	if (INTEL_GEN(dev_priv) >= 6) {
-		for (i = 0; i < dev_priv->num_fence_regs; i++)
+		for (i = 0; i < ggtt->num_fence_regs; i++)
 			error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
 	} else if (INTEL_GEN(dev_priv) >= 4) {
-		for (i = 0; i < dev_priv->num_fence_regs; i++)
+		for (i = 0; i < ggtt->num_fence_regs; i++)
 			error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
 	} else {
-		for (i = 0; i < dev_priv->num_fence_regs; i++)
+		for (i = 0; i < ggtt->num_fence_regs; i++)
 			error->fence[i] = I915_READ(FENCE_REG(i));
 	}
 	error->nfence = i;
-- 
2.18.0


* [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
  2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
  2018-07-11  7:36 ` [PATCH 2/7] drm/i915: Move fence register tracking to GGTT Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  9:08   ` Daniel Vetter
  2018-07-11  7:36 ` [PATCH 4/7] drm/i915: Move fence-reg interface to i915_gem_fence_reg.h Chris Wilson
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

Introduce a new mutex to guard all of the vma operations within a vm (as
opposed to the BKL struct_mutex) and start by using it to guard the
fence operations for a GGTT VMA.
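
The locking pattern (lifted from the diff below) is a straightforward
lock/unlock wrapper: the __-prefixed variants assert vm->mutex is already
held, while the plain variants take it themselves:

	static inline int i915_vma_pin_fence(struct i915_vma *vma)
	{
		int err;

		mutex_lock(&vma->vm->mutex);
		err = __i915_vma_pin_fence(vma);
		mutex_unlock(&vma->vm->mutex);

		return err;
	}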

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |  9 ++-
 drivers/gpu/drm/i915/i915_gem.c            | 11 +++-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  5 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 68 +++++++++++++++++-----
 drivers/gpu/drm/i915/i915_vma.c            | 12 ++--
 drivers/gpu/drm/i915/i915_vma.h            | 23 +++++++-
 6 files changed, 96 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 75ffed6a3f31..e2ba298a5d88 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -80,7 +80,7 @@ static char get_tiling_flag(struct drm_i915_gem_object *obj)
 
 static char get_global_flag(struct drm_i915_gem_object *obj)
 {
-	return obj->userfault_count ? 'g' : ' ';
+	return READ_ONCE(obj->userfault_count) ? 'g' : ' ';
 }
 
 static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
@@ -914,11 +914,10 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
 
 static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 {
-	struct drm_i915_private *i915 = node_to_i915(m->private);
-	const struct i915_ggtt *ggtt = &i915->ggtt;
+	struct i915_ggtt *ggtt = &node_to_i915(m->private)->ggtt;
 	int i, ret;
 
-	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
+	ret = mutex_lock_interruptible(&ggtt->vm.mutex);
 	if (ret)
 		return ret;
 
@@ -935,7 +934,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
 		seq_putc(m, '\n');
 	}
 
-	mutex_unlock(&i915->drm.struct_mutex);
+	mutex_unlock(&ggtt->vm.mutex);
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 356c86071ccc..cbcba613b175 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2193,8 +2193,8 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	 * requirement that operations to the GGTT be made holding the RPM
 	 * wakeref.
 	 */
-	lockdep_assert_held(&i915->drm.struct_mutex);
 	intel_runtime_pm_get(i915);
+	mutex_lock(&i915->ggtt.vm.mutex);
 
 	if (!obj->userfault_count)
 		goto out;
@@ -2211,6 +2211,7 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
 	wmb();
 
 out:
+	mutex_unlock(&i915->ggtt.vm.mutex);
 	intel_runtime_pm_put(i915);
 }
 
@@ -2223,10 +2224,12 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
 	/*
 	 * Only called during RPM suspend. All users of the userfault_list
 	 * must be holding an RPM wakeref to ensure that this can not
-	 * run concurrently with themselves (and use the struct_mutex for
+	 * run concurrently with themselves (and use the ggtt->mutex for
 	 * protection between themselves).
 	 */
 
+	mutex_lock(&i915->ggtt.vm.mutex);
+
 	list_for_each_entry_safe(obj, on,
 				 &i915->mm.userfault_list, userfault_link)
 		__i915_gem_object_release_mmap(obj);
@@ -2255,6 +2258,8 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
 		GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
 		reg->dirty = true;
 	}
+
+	mutex_unlock(&i915->ggtt.vm.mutex);
 }
 
 static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
@@ -4861,7 +4866,7 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		mutex_unlock(&i915->drm.struct_mutex);
 
 		GEM_BUG_ON(obj->bind_count);
-		GEM_BUG_ON(obj->userfault_count);
+		GEM_BUG_ON(READ_ONCE(obj->userfault_count));
 		GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
 		GEM_BUG_ON(!list_empty(&obj->lut_list));
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 3f0c612d42e7..e1d65b165bf1 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -426,8 +426,11 @@ static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags)
 {
 	GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN));
 
-	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE))
+	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE)) {
+		mutex_lock(&vma->vm->mutex);
 		__i915_vma_unpin_fence(vma);
+		mutex_unlock(&vma->vm->mutex);
+	}
 
 	__i915_vma_unpin(vma);
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 60fa5a8276cb..9313a8e675c8 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -188,6 +188,8 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
 static void fence_write(struct drm_i915_fence_reg *fence,
 			struct i915_vma *vma)
 {
+	lockdep_assert_held(&fence->ggtt->vm.mutex);
+
 	/* Previous access through the fence register is marshalled by
 	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
 	 * and explicitly managed for internal users.
@@ -213,6 +215,8 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 	struct i915_ggtt *ggtt = fence->ggtt;
 	int ret;
 
+	lockdep_assert_held(&ggtt->vm.mutex);
+
 	if (vma) {
 		if (!i915_vma_is_map_and_fenceable(vma))
 			return -EINVAL;
@@ -289,14 +293,39 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 int i915_vma_put_fence(struct i915_vma *vma)
 {
 	struct drm_i915_fence_reg *fence = vma->fence;
+	int err;
 
 	if (!fence)
 		return 0;
 
-	if (fence->pin_count)
-		return -EBUSY;
+	mutex_lock(&vma->vm->mutex);
+	if (!fence->pin_count)
+		err = fence_update(fence, NULL);
+	else
+		err = -EBUSY;
+	mutex_unlock(&vma->vm->mutex);
 
-	return fence_update(fence, NULL);
+	return err;
+}
+
+void i915_vma_revoke_fence(struct i915_vma *vma)
+{
+	struct drm_i915_fence_reg *fence;
+
+	GEM_BUG_ON(!i915_vma_is_ggtt(vma));
+	lockdep_assert_held(&vma->vm->mutex);
+
+	fence = vma->fence;
+	if (!fence)
+		return;
+
+	GEM_BUG_ON(fence->pin_count);
+
+	list_move(&fence->link, &i915_vm_to_ggtt(vma->vm)->fence_list);
+	vma->fence = NULL;
+
+	fence_write(fence, NULL);
+	fence->vma = NULL;
 }
 
 static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
@@ -337,8 +366,7 @@ static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
  *
  * 0 on success, negative error code on failure.
  */
-int
-i915_vma_pin_fence(struct i915_vma *vma)
+int __i915_vma_pin_fence(struct i915_vma *vma)
 {
 	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
 	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
@@ -349,6 +377,7 @@ i915_vma_pin_fence(struct i915_vma *vma)
 	 * must keep the device awake whilst using the fence.
 	 */
 	assert_rpm_wakelock_held(ggtt->vm.i915);
+	lockdep_assert_held(&ggtt->vm.mutex);
 
 	/* Just update our place in the LRU if our fence is getting reused. */
 	if (vma->fence) {
@@ -399,27 +428,34 @@ i915_reserve_fence(struct drm_i915_private *i915)
 	int count;
 	int ret;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	mutex_lock(&i915->ggtt.vm.mutex);
 
 	/* Keep at least one fence available for the display engine. */
 	count = 0;
 	list_for_each_entry(fence, &ggtt->fence_list, link)
 		count += !fence->pin_count;
-	if (count <= 1)
-		return ERR_PTR(-ENOSPC);
+	if (count <= 1) {
+		fence = ERR_PTR(-ENOSPC);
+		goto out_unlock;
+	}
 
 	fence = fence_find(ggtt);
 	if (IS_ERR(fence))
-		return fence;
+		goto out_unlock;
 
 	if (fence->vma) {
 		/* Force-remove fence from VMA */
 		ret = fence_update(fence, NULL);
-		if (ret)
-			return ERR_PTR(ret);
+		if (ret) {
+			fence = ERR_PTR(ret);
+			goto out_unlock;
+		}
 	}
 
 	list_del(&fence->link);
+
+out_unlock:
+	mutex_unlock(&i915->ggtt.vm.mutex);
 	return fence;
 }
 
@@ -431,9 +467,9 @@ i915_reserve_fence(struct drm_i915_private *i915)
  */
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 {
-	lockdep_assert_held(&fence->ggtt->vm.i915->drm.struct_mutex);
-
+	mutex_lock(&fence->ggtt->vm.mutex);
 	list_add(&fence->link, &fence->ggtt->fence_list);
+	mutex_unlock(&fence->ggtt->vm.mutex);
 }
 
 /**
@@ -451,8 +487,7 @@ void i915_gem_revoke_fences(struct drm_i915_private *i915)
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
+	mutex_lock(&ggtt->vm.mutex);
 	for (i = 0; i < ggtt->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
 
@@ -461,6 +496,7 @@ void i915_gem_revoke_fences(struct drm_i915_private *i915)
 		if (fence->vma)
 			i915_vma_revoke_mmap(fence->vma);
 	}
+	mutex_unlock(&ggtt->vm.mutex);
 }
 
 /**
@@ -476,6 +512,7 @@ void i915_gem_restore_fences(struct drm_i915_private *i915)
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
+	mutex_lock(&ggtt->vm.mutex);
 	for (i = 0; i < ggtt->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
 		struct i915_vma *vma = reg->vma;
@@ -498,6 +535,7 @@ void i915_gem_restore_fences(struct drm_i915_private *i915)
 		fence_write(reg, vma);
 		reg->vma = vma;
 	}
+	mutex_unlock(&ggtt->vm.mutex);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index ed4e0fb558f7..045b75d79f60 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -860,7 +860,7 @@ void i915_vma_revoke_mmap(struct i915_vma *vma)
 	struct drm_vma_offset_node *node = &vma->obj->base.vma_node;
 	u64 vma_offset;
 
-	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
+	lockdep_assert_held(&vma->vm->mutex);
 
 	if (!i915_vma_has_userfault(vma))
 		return;
@@ -1082,6 +1082,8 @@ int i915_vma_unbind(struct i915_vma *vma)
 		return 0;
 
 	if (i915_vma_is_map_and_fenceable(vma)) {
+		mutex_lock(&vma->vm->mutex);
+
 		/*
 		 * Check that we have flushed all writes through the GGTT
 		 * before the unbind, other due to non-strict nature of those
@@ -1091,16 +1093,14 @@ int i915_vma_unbind(struct i915_vma *vma)
 		i915_vma_flush_writes(vma);
 		GEM_BUG_ON(i915_vma_has_ggtt_write(vma));
 
-		/* release the fence reg _after_ flushing */
-		ret = i915_vma_put_fence(vma);
-		if (ret)
-			return ret;
-
 		/* Force a pagefault for domain tracking on next user access */
 		i915_vma_revoke_mmap(vma);
+		i915_vma_revoke_fence(vma);
 
 		__i915_vma_iounmap(vma);
 		vma->flags &= ~I915_VMA_CAN_FENCE;
+
+		mutex_unlock(&vma->vm->mutex);
 	}
 	GEM_BUG_ON(vma->fence);
 	GEM_BUG_ON(i915_vma_has_userfault(vma));
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index f06d66377107..422d90c686b5 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -190,6 +190,7 @@ static inline bool i915_vma_set_userfault(struct i915_vma *vma)
 
 static inline void i915_vma_unset_userfault(struct i915_vma *vma)
 {
+	lockdep_assert_held(&vma->vm->mutex);
 	return __clear_bit(I915_VMA_USERFAULT_BIT, &vma->flags);
 }
 
@@ -378,11 +379,26 @@ static inline struct page *i915_vma_first_page(struct i915_vma *vma)
  *
  * True if the vma has a fence, false otherwise.
  */
-int i915_vma_pin_fence(struct i915_vma *vma);
+int __i915_vma_pin_fence(struct i915_vma *vma);
+static inline int i915_vma_pin_fence(struct i915_vma *vma)
+{
+	int err;
+
+	mutex_lock(&vma->vm->mutex);
+	err = __i915_vma_pin_fence(vma);
+	mutex_unlock(&vma->vm->mutex);
+
+	return err;
+}
+
 int __must_check i915_vma_put_fence(struct i915_vma *vma);
+void i915_vma_revoke_fence(struct i915_vma *vma);
 
 static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
 {
+	lockdep_assert_held(&vma->vm->mutex);
+	GEM_BUG_ON(!i915_vma_is_ggtt(vma));
+
 	GEM_BUG_ON(vma->fence->pin_count <= 0);
 	vma->fence->pin_count--;
 }
@@ -399,8 +415,11 @@ static inline void
 i915_vma_unpin_fence(struct i915_vma *vma)
 {
 	/* lockdep_assert_held(&vma->vm->i915->drm.struct_mutex); */
-	if (vma->fence)
+	if (vma->fence) {
+		mutex_lock(&vma->vm->mutex);
 		__i915_vma_unpin_fence(vma);
+		mutex_unlock(&vma->vm->mutex);
+	}
 }
 
 void i915_vma_parked(struct drm_i915_private *i915);
-- 
2.18.0


* [PATCH 4/7] drm/i915: Move fence-reg interface to i915_gem_fence_reg.h
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (2 preceding siblings ...)
  2018-07-11  7:36 ` [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  7:36 ` [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg Chris Wilson
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

Since we have a header file for i915_gem_fence_reg, let's use it for the
interface prototypes currently hidden away in the huge i915_drv.h.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h           | 15 ---------------
 drivers/gpu/drm/i915/i915_gem_fence_reg.h | 16 ++++++++++++++++
 drivers/gpu/drm/i915/i915_vma.h           |  2 +-
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a7f2d747e221..43f545add21c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -70,7 +70,6 @@
 
 #include "i915_gem.h"
 #include "i915_gem_context.h"
-#include "i915_gem_fence_reg.h"
 #include "i915_gem_object.h"
 #include "i915_gem_gtt.h"
 #include "i915_gpu_error.h"
@@ -3193,20 +3192,6 @@ i915_vm_to_ppgtt(struct i915_address_space *vm)
 	return container_of(vm, struct i915_hw_ppgtt, vm);
 }
 
-/* i915_gem_fence_reg.c */
-struct drm_i915_fence_reg *
-i915_reserve_fence(struct drm_i915_private *dev_priv);
-void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
-
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
-void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
-
-void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
-void i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
-				       struct sg_table *pages);
-void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
-					 struct sg_table *pages);
-
 static inline struct i915_gem_context *
 __i915_gem_context_lookup_rcu(struct drm_i915_file_private *file_priv, u32 id)
 {
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index c8f1d0cdfa90..c510f8efc1bb 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -27,7 +27,10 @@
 
 #include <linux/list.h>
 
+struct sg_table;
+
 struct drm_i915_private;
+struct drm_i915_gem_object;
 struct i915_ggtt;
 struct i915_vma;
 
@@ -55,4 +58,17 @@ struct drm_i915_fence_reg {
 
 void i915_ggtt_init_fences(struct i915_ggtt *ggtt);
 
+struct drm_i915_fence_reg *
+i915_reserve_fence(struct drm_i915_private *i915);
+void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
+
+void i915_gem_revoke_fences(struct drm_i915_private *i915);
+void i915_gem_restore_fences(struct drm_i915_private *i915);
+
+void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *i915);
+void i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
+				       struct sg_table *pages);
+void i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj,
+					 struct sg_table *pages);
+
 #endif
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 422d90c686b5..925af79cc6d6 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -31,12 +31,12 @@
 #include <drm/drm_mm.h>
 
 #include "i915_gem_gtt.h"
-#include "i915_gem_fence_reg.h"
 #include "i915_gem_object.h"
 
 #include "i915_request.h"
 
 enum i915_cache_level;
+struct drm_i915_fence_reg;
 
 /**
  * A VMA represents a GEM BO that is bound into an address space. Therefore, a
-- 
2.18.0


* [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (3 preceding siblings ...)
  2018-07-11  7:36 ` [PATCH 4/7] drm/i915: Move fence-reg interface to i915_gem_fence_reg.h Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  9:11   ` Daniel Vetter
  2018-07-11  7:36 ` [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

If we dynamically allocate a correctly sized array for the fence
registers, we can avoid the 4x overallocation on older, typically
smaller devices and avoid having to know the static layout in advance.
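
The core of the change is the allocate/free pairing (as in the diff
below), sized from the num_fence_regs value that previously indexed the
static array:

	ggtt->fence_regs = kcalloc(ggtt->num_fence_regs,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
	if (!ggtt->fence_regs)
		return -ENOMEM;

	/* ... and on takedown ... */
	kfree(ggtt->fence_regs);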

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c           | 33 ------------
 drivers/gpu/drm/i915/i915_gem_fence_reg.h |  2 -
 drivers/gpu/drm/i915/i915_gem_gtt.c       | 64 +++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_gem_gtt.h       |  3 +-
 drivers/gpu/drm/i915/i915_vma.h           |  1 +
 5 files changed, 62 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cbcba613b175..8eecd68f9e23 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5636,39 +5636,6 @@ i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine(engine);
 }
 
-void i915_ggtt_init_fences(struct i915_ggtt *ggtt)
-{
-	struct drm_i915_private *dev_priv = ggtt->vm.i915;
-	int i;
-
-	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
-	    !IS_CHERRYVIEW(dev_priv))
-		ggtt->num_fence_regs = 32;
-	else if (INTEL_GEN(dev_priv) >= 4 ||
-		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
-		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
-		ggtt->num_fence_regs = 16;
-	else
-		ggtt->num_fence_regs = 8;
-
-	if (intel_vgpu_active(dev_priv))
-		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
-
-	INIT_LIST_HEAD(&ggtt->fence_list);
-
-	/* Initialize fence registers to zero */
-	for (i = 0; i < ggtt->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
-
-		fence->ggtt = ggtt;
-		fence->id = i;
-		list_add_tail(&fence->link, &ggtt->fence_list);
-	}
-	i915_gem_restore_fences(dev_priv);
-
-	i915_gem_detect_bit_6_swizzle(dev_priv);
-}
-
 static void i915_gem_init__mm(struct drm_i915_private *i915)
 {
 	spin_lock_init(&i915->mm.object_stat_lock);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index c510f8efc1bb..6e66f6b3f851 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -56,8 +56,6 @@ struct drm_i915_fence_reg {
 	bool dirty;
 };
 
-void i915_ggtt_init_fences(struct i915_ggtt *ggtt);
-
 struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *i915);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index abf41f90a925..e6787c3af544 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -37,6 +37,7 @@
 #include <drm/i915_drm.h>
 
 #include "i915_drv.h"
+#include "i915_gem_fence_reg.h"
 #include "i915_vgpu.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
@@ -2901,6 +2902,51 @@ void i915_gem_fini_aliasing_ppgtt(struct drm_i915_private *i915)
 	ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma;
 }
 
+static int i915_ggtt_init_fences(struct i915_ggtt *ggtt)
+{
+	struct drm_i915_private *dev_priv = ggtt->vm.i915;
+	int i;
+
+	if (INTEL_GEN(dev_priv) >= 7 &&
+	    !(IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)))
+		ggtt->num_fence_regs = 32;
+	else if (INTEL_GEN(dev_priv) >= 4 ||
+		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
+		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
+		ggtt->num_fence_regs = 16;
+	else
+		ggtt->num_fence_regs = 8;
+
+	if (intel_vgpu_active(dev_priv))
+		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
+
+	ggtt->fence_regs = kcalloc(ggtt->num_fence_regs,
+				   sizeof(*ggtt->fence_regs),
+				   GFP_KERNEL);
+	if (!ggtt->fence_regs)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&ggtt->fence_list);
+
+	/* Initialize fence registers to zero */
+	for (i = 0; i < ggtt->num_fence_regs; i++) {
+		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
+
+		fence->ggtt = ggtt;
+		fence->id = i;
+		list_add_tail(&fence->link, &ggtt->fence_list);
+	}
+	i915_gem_restore_fences(dev_priv);
+
+	i915_gem_detect_bit_6_swizzle(dev_priv);
+	return 0;
+}
+
+static void i915_ggtt_cleanup_fences(struct i915_ggtt *ggtt)
+{
+	kfree(ggtt->fence_regs);
+}
+
 int i915_gem_init_ggtt(struct drm_i915_private *dev_priv)
 {
 	/* Let GEM Manage all of the aperture.
@@ -2990,6 +3036,8 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
 
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
+	i915_ggtt_cleanup_fences(ggtt);
+
 	arch_phys_wc_del(ggtt->mtrr);
 	io_mapping_fini(&ggtt->iomap);
 
@@ -3595,13 +3643,15 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
 		ggtt->vm.mm.color_adjust = i915_gtt_color_adjust;
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
-	i915_ggtt_init_fences(ggtt);
+	ret = i915_ggtt_init_fences(ggtt);
+	if (ret)
+		goto err_fini;
 
 	if (!io_mapping_init_wc(&ggtt->iomap,
 				ggtt->gmadr.start,
 				ggtt->mappable_end)) {
 		ret = -EIO;
-		goto out_gtt_cleanup;
+		goto err_fences;
 	}
 
 	ggtt->mtrr = arch_phys_wc_add(ggtt->gmadr.start, ggtt->mappable_end);
@@ -3612,12 +3662,18 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
 	 */
 	ret = i915_gem_init_stolen(dev_priv);
 	if (ret)
-		goto out_gtt_cleanup;
+		goto err_io;
 
 	return 0;
 
-out_gtt_cleanup:
+err_io:
+	arch_phys_wc_del(ggtt->mtrr);
+	io_mapping_fini(&ggtt->iomap);
+err_fences:
+	i915_ggtt_cleanup_fences(ggtt);
+err_fini:
 	ggtt->vm.cleanup(&ggtt->vm);
+	i915_address_space_fini(&ggtt->vm);
 	return ret;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index f35a85284b1a..f8c372dd6362 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -38,7 +38,6 @@
 #include <linux/mm.h>
 #include <linux/pagevec.h>
 
-#include "i915_gem_fence_reg.h"
 #include "i915_request.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
@@ -398,7 +397,7 @@ struct i915_ggtt {
 
 	/** LRU list of objects with fence regs on them. */
 	struct list_head fence_list;
-	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES];
+	struct drm_i915_fence_reg *fence_regs;
 	int num_fence_regs;
 
 	struct drm_mm_node error_capture;
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 925af79cc6d6..7df156e1ca06 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -30,6 +30,7 @@
 
 #include <drm/drm_mm.h>
 
+#include "i915_gem_fence_reg.h"
 #include "i915_gem_gtt.h"
 #include "i915_gem_object.h"
 
-- 
2.18.0


* [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (4 preceding siblings ...)
  2018-07-11  7:36 ` [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  9:17   ` Daniel Vetter
  2018-07-11  7:36 ` [PATCH 7/7] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

Currently the code to reset the GPU and our state is spread widely
across a few files. Pull the logic together into a common file.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile                 |    3 +-
 drivers/gpu/drm/i915/i915_debugfs.c           |    2 +
 drivers/gpu/drm/i915/i915_drv.c               |  207 +--
 drivers/gpu/drm/i915/i915_drv.h               |   31 +-
 drivers/gpu/drm/i915/i915_gem.c               |  465 +-----
 drivers/gpu/drm/i915/i915_irq.c               |  220 ---
 drivers/gpu/drm/i915/i915_request.c           |    1 +
 drivers/gpu/drm/i915/i915_reset.c             | 1271 +++++++++++++++++
 drivers/gpu/drm/i915/i915_reset.h             |   37 +
 drivers/gpu/drm/i915/intel_display.c          |   15 +-
 drivers/gpu/drm/i915/intel_guc.h              |    3 +
 drivers/gpu/drm/i915/intel_hangcheck.c        |    1 +
 drivers/gpu/drm/i915/intel_uc.c               |    1 +
 drivers/gpu/drm/i915/intel_uncore.c           |  415 ------
 .../drm/i915/selftests/intel_workarounds.c    |    1 +
 15 files changed, 1342 insertions(+), 1331 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_reset.c
 create mode 100644 drivers/gpu/drm/i915/i915_reset.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 5794f102f9b8..d09799e79893 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -40,7 +40,8 @@ i915-y := i915_drv.o \
 	  i915_mm.o \
 	  i915_params.o \
 	  i915_pci.o \
-          i915_suspend.o \
+	  i915_reset.o \
+	  i915_suspend.o \
 	  i915_syncmap.o \
 	  i915_sw_fence.o \
 	  i915_sysfs.o \
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index e2ba298a5d88..a0f519c44410 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -32,6 +32,8 @@
 #include "intel_drv.h"
 #include "intel_guc_submission.h"
 
+#include "i915_reset.h"
+
 static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
 {
 	return to_i915(node->minor->dev);
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 97a2054c38d4..fa3b4144a7fa 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -49,6 +49,7 @@
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "i915_pmu.h"
+#include "i915_reset.h"
 #include "i915_query.h"
 #include "i915_vgpu.h"
 #include "intel_drv.h"
@@ -1878,212 +1879,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
 	return i915_drm_resume(dev);
 }
 
-/**
- * i915_reset - reset chip after a hang
- * @i915: #drm_i915_private to reset
- * @stalled_mask: mask of the stalled engines with the guilty requests
- * @reason: user error message for why we are resetting
- *
- * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
- * on failure.
- *
- * Caller must hold the struct_mutex.
- *
- * Procedure is fairly simple:
- *   - reset the chip using the reset reg
- *   - re-init context state
- *   - re-init hardware status page
- *   - re-init ring buffer
- *   - re-init interrupt state
- *   - re-init display
- */
-void i915_reset(struct drm_i915_private *i915,
-		unsigned int stalled_mask,
-		const char *reason)
-{
-	struct i915_gpu_error *error = &i915->gpu_error;
-	int ret;
-	int i;
-
-	GEM_TRACE("flags=%lx\n", error->flags);
-
-	might_sleep();
-	lockdep_assert_held(&i915->drm.struct_mutex);
-	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
-
-	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-		return;
-
-	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
-		goto wakeup;
-
-	if (reason)
-		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
-	error->reset_count++;
-
-	disable_irq(i915->drm.irq);
-	ret = i915_gem_reset_prepare(i915);
-	if (ret) {
-		dev_err(i915->drm.dev, "GPU recovery failed\n");
-		goto taint;
-	}
-
-	if (!intel_has_gpu_reset(i915)) {
-		if (i915_modparams.reset)
-			dev_err(i915->drm.dev, "GPU reset not supported\n");
-		else
-			DRM_DEBUG_DRIVER("GPU reset disabled\n");
-		goto error;
-	}
-
-	for (i = 0; i < 3; i++) {
-		ret = intel_gpu_reset(i915, ALL_ENGINES);
-		if (ret == 0)
-			break;
-
-		msleep(100);
-	}
-	if (ret) {
-		dev_err(i915->drm.dev, "Failed to reset chip\n");
-		goto taint;
-	}
-
-	/* Ok, now get things going again... */
-
-	/*
-	 * Everything depends on having the GTT running, so we need to start
-	 * there.
-	 */
-	ret = i915_ggtt_enable_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	i915_gem_reset(i915, stalled_mask);
-	intel_overlay_reset(i915);
-
-	/*
-	 * Next we need to restore the context, but we don't use those
-	 * yet either...
-	 *
-	 * Ring buffer needs to be re-initialized in the KMS case, or if X
-	 * was running at the time of the reset (i.e. we weren't VT
-	 * switched away).
-	 */
-	ret = i915_gem_init_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	i915_queue_hangcheck(i915);
-
-finish:
-	i915_gem_reset_finish(i915);
-	enable_irq(i915->drm.irq);
-
-wakeup:
-	clear_bit(I915_RESET_HANDOFF, &error->flags);
-	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
-	return;
-
-taint:
-	/*
-	 * History tells us that if we cannot reset the GPU now, we
-	 * never will. This then impacts everything that is run
-	 * subsequently. On failing the reset, we mark the driver
-	 * as wedged, preventing further execution on the GPU.
-	 * We also want to go one step further and add a taint to the
-	 * kernel so that any subsequent faults can be traced back to
-	 * this failure. This is important for CI, where if the
-	 * GPU/driver fails we would like to reboot and restart testing
-	 * rather than continue on into oblivion. For everyone else,
-	 * the system should still plod along, but they have been warned!
-	 */
-	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
-error:
-	i915_gem_set_wedged(i915);
-	i915_retire_requests(i915);
-	goto finish;
-}
-
-static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
-					struct intel_engine_cs *engine)
-{
-	return intel_gpu_reset(dev_priv, intel_engine_flag(engine));
-}
-
-/**
- * i915_reset_engine - reset GPU engine to recover from a hang
- * @engine: engine to reset
- * @msg: reason for GPU reset; or NULL for no dev_notice()
- *
- * Reset a specific GPU engine. Useful if a hang is detected.
- * Returns zero on successful reset or otherwise an error code.
- *
- * Procedure is:
- *  - identifies the request that caused the hang and it is dropped
- *  - reset engine (which will force the engine to idle)
- *  - re-init/configure engine
- */
-int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
-{
-	struct i915_gpu_error *error = &engine->i915->gpu_error;
-	struct i915_request *active_request;
-	int ret;
-
-	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
-	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
-
-	active_request = i915_gem_reset_prepare_engine(engine);
-	if (IS_ERR_OR_NULL(active_request)) {
-		/* Either the previous reset failed, or we pardon the reset. */
-		ret = PTR_ERR(active_request);
-		goto out;
-	}
-
-	if (msg)
-		dev_notice(engine->i915->drm.dev,
-			   "Resetting %s for %s\n", engine->name, msg);
-	error->reset_engine_count[engine->id]++;
-
-	if (!engine->i915->guc.execbuf_client)
-		ret = intel_gt_reset_engine(engine->i915, engine);
-	else
-		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
-	if (ret) {
-		/* If we fail here, we expect to fallback to a global reset */
-		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
-				 engine->i915->guc.execbuf_client ? "GuC " : "",
-				 engine->name, ret);
-		goto out;
-	}
-
-	/*
-	 * The request that caused the hang is stuck on elsp, we know the
-	 * active request and can drop it, adjust head to skip the offending
-	 * request to resume executing remaining requests in the queue.
-	 */
-	i915_gem_reset_engine(engine, active_request, true);
-
-	/*
-	 * The engine and its registers (and workarounds in case of render)
-	 * have been reset to their default values. Follow the init_ring
-	 * process to program RING_MODE, HWSP and re-enable submission.
-	 */
-	ret = engine->init_hw(engine);
-	if (ret)
-		goto out;
-
-out:
-	i915_gem_reset_finish_engine(engine);
-	return ret;
-}
-
 static int i915_pm_prepare(struct device *kdev)
 {
 	struct pci_dev *pdev = to_pci_dev(kdev);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 43f545add21c..84b1073eacd8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2700,19 +2700,7 @@ extern const struct dev_pm_ops i915_pm_ops;
 extern int i915_driver_load(struct pci_dev *pdev,
 			    const struct pci_device_id *ent);
 extern void i915_driver_unload(struct drm_device *dev);
-extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
-extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
-
-extern void i915_reset(struct drm_i915_private *i915,
-		       unsigned int stalled_mask,
-		       const char *reason);
-extern int i915_reset_engine(struct intel_engine_cs *engine,
-			     const char *reason);
-
-extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
-extern int intel_reset_guc(struct drm_i915_private *dev_priv);
-extern int intel_guc_reset_engine(struct intel_guc *guc,
-				  struct intel_engine_cs *engine);
+
 extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
 extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
 extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
@@ -2757,13 +2745,6 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
 			   &dev_priv->gpu_error.hangcheck_work, delay);
 }
 
-__printf(4, 5)
-void i915_handle_error(struct drm_i915_private *dev_priv,
-		       u32 engine_mask,
-		       unsigned long flags,
-		       const char *fmt, ...);
-#define I915_ERROR_CAPTURE BIT(0)
-
 extern void intel_irq_init(struct drm_i915_private *dev_priv);
 extern void intel_irq_fini(struct drm_i915_private *dev_priv);
 int intel_irq_install(struct drm_i915_private *dev_priv);
@@ -3126,18 +3107,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
 	return READ_ONCE(error->reset_engine_count[engine->id]);
 }
 
-struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
-int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
-void i915_gem_reset(struct drm_i915_private *dev_priv,
-		    unsigned int stalled_mask);
-void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
-void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
 void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
 bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
-void i915_gem_reset_engine(struct intel_engine_cs *engine,
-			   struct i915_request *request,
-			   bool stalled);
 
 void i915_gem_init_mmio(struct drm_i915_private *i915);
 int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8eecd68f9e23..b5822cc36221 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -28,15 +28,6 @@
 #include <drm/drmP.h>
 #include <drm/drm_vma_manager.h>
 #include <drm/i915_drm.h>
-#include "i915_drv.h"
-#include "i915_gem_clflush.h"
-#include "i915_vgpu.h"
-#include "i915_trace.h"
-#include "intel_drv.h"
-#include "intel_frontbuffer.h"
-#include "intel_mocs.h"
-#include "intel_workarounds.h"
-#include "i915_gemfs.h"
 #include <linux/dma-fence-array.h>
 #include <linux/kthread.h>
 #include <linux/reservation.h>
@@ -47,6 +38,18 @@
 #include <linux/pci.h>
 #include <linux/dma-buf.h>
 
+#include "i915_drv.h"
+#include "i915_gem_clflush.h"
+#include "i915_gemfs.h"
+#include "i915_reset.h"
+#include "i915_trace.h"
+#include "i915_vgpu.h"
+
+#include "intel_drv.h"
+#include "intel_frontbuffer.h"
+#include "intel_mocs.h"
+#include "intel_workarounds.h"
+
 static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
 
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
@@ -2960,61 +2963,6 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
-static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
-					const struct i915_gem_context *ctx)
-{
-	unsigned int score;
-	unsigned long prev_hang;
-
-	if (i915_gem_context_is_banned(ctx))
-		score = I915_CLIENT_SCORE_CONTEXT_BAN;
-	else
-		score = 0;
-
-	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
-	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
-		score += I915_CLIENT_SCORE_HANG_FAST;
-
-	if (score) {
-		atomic_add(score, &file_priv->ban_score);
-
-		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
-				 ctx->name, score,
-				 atomic_read(&file_priv->ban_score));
-	}
-}
-
-static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
-{
-	unsigned int score;
-	bool banned, bannable;
-
-	atomic_inc(&ctx->guilty_count);
-
-	bannable = i915_gem_context_is_bannable(ctx);
-	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
-	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
-
-	/* Cool contexts don't accumulate client ban score */
-	if (!bannable)
-		return;
-
-	if (banned) {
-		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
-				 ctx->name, atomic_read(&ctx->guilty_count),
-				 score);
-		i915_gem_context_set_banned(ctx);
-	}
-
-	if (!IS_ERR_OR_NULL(ctx->file_priv))
-		i915_gem_client_mark_guilty(ctx->file_priv, ctx);
-}
-
-static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
-{
-	atomic_inc(&ctx->active_count);
-}
-
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine)
 {
@@ -3045,395 +2993,6 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	return active;
 }
 
-/*
- * Ensure irq handler finishes, and not run again.
- * Also return the active request so that we only search for it once.
- */
-struct i915_request *
-i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
-{
-	struct i915_request *request;
-
-	/*
-	 * During the reset sequence, we must prevent the engine from
-	 * entering RC6. As the context state is undefined until we restart
-	 * the engine, if it does enter RC6 during the reset, the state
-	 * written to the powercontext is undefined and so we may lose
-	 * GPU state upon resume, i.e. fail to restart after a reset.
-	 */
-	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-	request = engine->reset.prepare(engine);
-	if (request && request->fence.error == -EIO)
-		request = ERR_PTR(-EIO); /* Previous reset failed! */
-
-	return request;
-}
-
-int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
-{
-	struct intel_engine_cs *engine;
-	struct i915_request *request;
-	enum intel_engine_id id;
-	int err = 0;
-
-	for_each_engine(engine, dev_priv, id) {
-		request = i915_gem_reset_prepare_engine(engine);
-		if (IS_ERR(request)) {
-			err = PTR_ERR(request);
-			continue;
-		}
-
-		engine->hangcheck.active_request = request;
-	}
-
-	i915_gem_revoke_fences(dev_priv);
-	intel_uc_sanitize(dev_priv);
-
-	return err;
-}
-
-static void engine_skip_context(struct i915_request *request)
-{
-	struct intel_engine_cs *engine = request->engine;
-	struct i915_gem_context *hung_ctx = request->gem_context;
-	struct i915_timeline *timeline = request->timeline;
-	unsigned long flags;
-
-	GEM_BUG_ON(timeline == &engine->timeline);
-
-	spin_lock_irqsave(&engine->timeline.lock, flags);
-	spin_lock(&timeline->lock);
-
-	list_for_each_entry_continue(request, &engine->timeline.requests, link)
-		if (request->gem_context == hung_ctx)
-			i915_request_skip(request, -EIO);
-
-	list_for_each_entry(request, &timeline->requests, link)
-		i915_request_skip(request, -EIO);
-
-	spin_unlock(&timeline->lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
-}
-
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-i915_gem_reset_request(struct intel_engine_cs *engine,
-		       struct i915_request *request,
-		       bool stalled)
-{
-	/* The guilty request will get skipped on a hung engine.
-	 *
-	 * Users of client default contexts do not rely on logical
-	 * state preserved between batches so it is safe to execute
-	 * queued requests following the hang. Non default contexts
-	 * rely on preserved state, so skipping a batch loses the
-	 * evolution of the state and it needs to be considered corrupted.
-	 * Executing more queued batches on top of corrupted state is
-	 * risky. But we take the risk by trying to advance through
-	 * the queued requests in order to make the client behaviour
-	 * more predictable around resets, by not throwing away random
-	 * amount of batches it has prepared for execution. Sophisticated
-	 * clients can use gem_reset_stats_ioctl and dma fence status
-	 * (exported via sync_file info ioctl on explicit fences) to observe
-	 * when it loses the context state and should rebuild accordingly.
-	 *
-	 * The context ban, and ultimately the client ban, mechanism are safety
-	 * valves if client submission ends up resulting in nothing more than
-	 * subsequent hangs.
-	 */
-
-	if (i915_request_completed(request)) {
-		GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
-			  engine->name, request->global_seqno,
-			  request->fence.context, request->fence.seqno,
-			  intel_engine_get_seqno(engine));
-		stalled = false;
-	}
-
-	if (stalled) {
-		i915_gem_context_mark_guilty(request->gem_context);
-		i915_request_skip(request, -EIO);
-
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(request->gem_context))
-			engine_skip_context(request);
-	} else {
-		/*
-		 * Since this is not the hung engine, it may have advanced
-		 * since the hang declaration. Double check by refinding
-		 * the active request at the time of the reset.
-		 */
-		request = i915_gem_find_active_request(engine);
-		if (request) {
-			unsigned long flags;
-
-			i915_gem_context_mark_innocent(request->gem_context);
-			dma_fence_set_error(&request->fence, -EAGAIN);
-
-			/* Rewind the engine to replay the incomplete rq */
-			spin_lock_irqsave(&engine->timeline.lock, flags);
-			request = list_prev_entry(request, link);
-			if (&request->link == &engine->timeline.requests)
-				request = NULL;
-			spin_unlock_irqrestore(&engine->timeline.lock, flags);
-		}
-	}
-
-	return request;
-}
-
-void i915_gem_reset_engine(struct intel_engine_cs *engine,
-			   struct i915_request *request,
-			   bool stalled)
-{
-	/*
-	 * Make sure this write is visible before we re-enable the interrupt
-	 * handlers on another CPU, as tasklet_enable() resolves to just
-	 * a compiler barrier which is insufficient for our purpose here.
-	 */
-	smp_store_mb(engine->irq_posted, 0);
-
-	if (request)
-		request = i915_gem_reset_request(engine, request, stalled);
-
-	/* Setup the CS to resume from the breadcrumb of the hung request */
-	engine->reset.reset(engine, request);
-}
-
-void i915_gem_reset(struct drm_i915_private *dev_priv,
-		    unsigned int stalled_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	i915_retire_requests(dev_priv);
-
-	for_each_engine(engine, dev_priv, id) {
-		struct intel_context *ce;
-
-		i915_gem_reset_engine(engine,
-				      engine->hangcheck.active_request,
-				      stalled_mask & ENGINE_MASK(id));
-		ce = fetch_and_zero(&engine->last_retired_context);
-		if (ce)
-			intel_context_unpin(ce);
-
-		/*
-		 * Ostensibily, we always want a context loaded for powersaving,
-		 * so if the engine is idle after the reset, send a request
-		 * to load our scratch kernel_context.
-		 *
-		 * More mysteriously, if we leave the engine idle after a reset,
-		 * the next userspace batch may hang, with what appears to be
-		 * an incoherent read by the CS (presumably stale TLB). An
-		 * empty request appears sufficient to paper over the glitch.
-		 */
-		if (intel_engine_is_idle(engine)) {
-			struct i915_request *rq;
-
-			rq = i915_request_alloc(engine,
-						dev_priv->kernel_context);
-			if (!IS_ERR(rq))
-				i915_request_add(rq);
-		}
-	}
-
-	i915_gem_restore_fences(dev_priv);
-}
-
-void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
-{
-	engine->reset.finish(engine);
-
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
-}
-
-void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for_each_engine(engine, dev_priv, id) {
-		engine->hangcheck.active_request = NULL;
-		i915_gem_reset_finish_engine(engine);
-	}
-}
-
-static void nop_submit_request(struct i915_request *request)
-{
-	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
-		  request->engine->name,
-		  request->fence.context, request->fence.seqno);
-	dma_fence_set_error(&request->fence, -EIO);
-
-	i915_request_submit(request);
-}
-
-static void nop_complete_submit_request(struct i915_request *request)
-{
-	unsigned long flags;
-
-	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
-		  request->engine->name,
-		  request->fence.context, request->fence.seqno);
-	dma_fence_set_error(&request->fence, -EIO);
-
-	spin_lock_irqsave(&request->engine->timeline.lock, flags);
-	__i915_request_submit(request);
-	intel_engine_init_global_seqno(request->engine, request->global_seqno);
-	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
-}
-
-void i915_gem_set_wedged(struct drm_i915_private *i915)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	GEM_TRACE("start\n");
-
-	if (GEM_SHOW_DEBUG()) {
-		struct drm_printer p = drm_debug_printer(__func__);
-
-		for_each_engine(engine, i915, id)
-			intel_engine_dump(engine, &p, "%s\n", engine->name);
-	}
-
-	set_bit(I915_WEDGED, &i915->gpu_error.flags);
-	smp_mb__after_atomic();
-
-	/*
-	 * First, stop submission to hw, but do not yet complete requests by
-	 * rolling the global seqno forward (since this would complete requests
-	 * for which we haven't set the fence error to EIO yet).
-	 */
-	for_each_engine(engine, i915, id) {
-		i915_gem_reset_prepare_engine(engine);
-
-		engine->submit_request = nop_submit_request;
-		engine->schedule = NULL;
-	}
-	i915->caps.scheduler = 0;
-
-	/* Even if the GPU reset fails, it should still stop the engines */
-	intel_gpu_reset(i915, ALL_ENGINES);
-
-	/*
-	 * Make sure no one is running the old callback before we proceed with
-	 * cancelling requests and resetting the completion tracking. Otherwise
-	 * we might submit a request to the hardware which never completes.
-	 */
-	synchronize_rcu();
-
-	for_each_engine(engine, i915, id) {
-		/* Mark all executing requests as skipped */
-		engine->cancel_requests(engine);
-
-		/*
-		 * Only once we've force-cancelled all in-flight requests can we
-		 * start to complete all requests.
-		 */
-		engine->submit_request = nop_complete_submit_request;
-	}
-
-	/*
-	 * Make sure no request can slip through without getting completed by
-	 * either this call here to intel_engine_init_global_seqno, or the one
-	 * in nop_complete_submit_request.
-	 */
-	synchronize_rcu();
-
-	for_each_engine(engine, i915, id) {
-		unsigned long flags;
-
-		/*
-		 * Mark all pending requests as complete so that any concurrent
-		 * (lockless) lookup doesn't try and wait upon the request as we
-		 * reset it.
-		 */
-		spin_lock_irqsave(&engine->timeline.lock, flags);
-		intel_engine_init_global_seqno(engine,
-					       intel_engine_last_submit(engine));
-		spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-		i915_gem_reset_finish_engine(engine);
-	}
-
-	GEM_TRACE("end\n");
-
-	wake_up_all(&i915->gpu_error.reset_queue);
-}
-
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
-{
-	struct i915_timeline *tl;
-
-	lockdep_assert_held(&i915->drm.struct_mutex);
-	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
-		return true;
-
-	GEM_TRACE("start\n");
-
-	/*
-	 * Before unwedging, make sure that all pending operations
-	 * are flushed and errored out - we may have requests waiting upon
-	 * third party fences. We marked all inflight requests as EIO, and
-	 * every execbuf since returned EIO, for consistency we want all
-	 * the currently pending requests to also be marked as EIO, which
-	 * is done inside our nop_submit_request - and so we must wait.
-	 *
-	 * No more can be submitted until we reset the wedged bit.
-	 */
-	list_for_each_entry(tl, &i915->gt.timelines, link) {
-		struct i915_request *rq;
-
-		rq = i915_gem_active_peek(&tl->last_request,
-					  &i915->drm.struct_mutex);
-		if (!rq)
-			continue;
-
-		/*
-		 * We can't use our normal waiter as we want to
-		 * avoid recursively trying to handle the current
-		 * reset. The basic dma_fence_default_wait() installs
-		 * a callback for dma_fence_signal(), which is
-		 * triggered by our nop handler (indirectly, the
-		 * callback enables the signaler thread which is
-		 * woken by the nop_submit_request() advancing the seqno
-		 * and when the seqno passes the fence, the signaler
-		 * then signals the fence waking us up).
-		 */
-		if (dma_fence_default_wait(&rq->fence, true,
-					   MAX_SCHEDULE_TIMEOUT) < 0)
-			return false;
-	}
-	i915_retire_requests(i915);
-	GEM_BUG_ON(i915->gt.active_requests);
-
-	/*
-	 * Undo nop_submit_request. We prevent all new i915 requests from
-	 * being queued (by disallowing execbuf whilst wedged) so having
-	 * waited for all active requests above, we know the system is idle
-	 * and do not have to worry about a thread being inside
-	 * engine->submit_request() as we swap over. So unlike installing
-	 * the nop_submit_request on reset, we can do this from normal
-	 * context and do not require stop_machine().
-	 */
-	intel_engines_reset_default_submission(i915);
-	i915_gem_contexts_lost(i915);
-
-	GEM_TRACE("end\n");
-
-	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
-	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-
-	return true;
-}
-
 static void
 i915_gem_retire_work_handler(struct work_struct *work)
 {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 495b9d27990e..76daa31dc2ba 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -2928,46 +2928,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-struct wedge_me {
-	struct delayed_work work;
-	struct drm_i915_private *i915;
-	const char *name;
-};
-
-static void wedge_me(struct work_struct *work)
-{
-	struct wedge_me *w = container_of(work, typeof(*w), work.work);
-
-	dev_err(w->i915->drm.dev,
-		"%s timed out, cancelling all in-flight rendering.\n",
-		w->name);
-	i915_gem_set_wedged(w->i915);
-}
-
-static void __init_wedge(struct wedge_me *w,
-			 struct drm_i915_private *i915,
-			 long timeout,
-			 const char *name)
-{
-	w->i915 = i915;
-	w->name = name;
-
-	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
-	schedule_delayed_work(&w->work, timeout);
-}
-
-static void __fini_wedge(struct wedge_me *w)
-{
-	cancel_delayed_work_sync(&w->work);
-	destroy_delayed_work_on_stack(&w->work);
-	w->i915 = NULL;
-}
-
-#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
-	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
-	     (W)->i915;							\
-	     __fini_wedge((W)))
-
 static u32
 gen11_gt_engine_identity(struct drm_i915_private * const i915,
 			 const unsigned int bank, const unsigned int bit)
@@ -3172,186 +3132,6 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-static void i915_reset_device(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
-			      const char *reason)
-{
-	struct i915_gpu_error *error = &dev_priv->gpu_error;
-	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
-	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
-	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
-	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
-	struct wedge_me w;
-
-	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
-
-	DRM_DEBUG_DRIVER("resetting chip\n");
-	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
-
-	/* Use a watchdog to ensure that our reset completes */
-	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
-		intel_prepare_reset(dev_priv);
-
-		error->reason = reason;
-		error->stalled_mask = engine_mask;
-
-		/* Signal that locked waiters should reset the GPU */
-		smp_mb__before_atomic();
-		set_bit(I915_RESET_HANDOFF, &error->flags);
-		wake_up_all(&error->wait_queue);
-
-		/* Wait for anyone holding the lock to wakeup, without
-		 * blocking indefinitely on struct_mutex.
-		 */
-		do {
-			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
-				i915_reset(dev_priv, engine_mask, reason);
-				mutex_unlock(&dev_priv->drm.struct_mutex);
-			}
-		} while (wait_on_bit_timeout(&error->flags,
-					     I915_RESET_HANDOFF,
-					     TASK_UNINTERRUPTIBLE,
-					     1));
-
-		error->stalled_mask = 0;
-		error->reason = NULL;
-
-		intel_finish_reset(dev_priv);
-	}
-
-	if (!test_bit(I915_WEDGED, &error->flags))
-		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
-}
-
-static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
-{
-	u32 eir;
-
-	if (!IS_GEN2(dev_priv))
-		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
-
-	if (INTEL_GEN(dev_priv) < 4)
-		I915_WRITE(IPEIR, I915_READ(IPEIR));
-	else
-		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
-
-	I915_WRITE(EIR, I915_READ(EIR));
-	eir = I915_READ(EIR);
-	if (eir) {
-		/*
-		 * some errors might have become stuck,
-		 * mask them.
-		 */
-		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
-		I915_WRITE(EMR, I915_READ(EMR) | eir);
-		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
-	}
-}
-
-/**
- * i915_handle_error - handle a gpu error
- * @dev_priv: i915 device private
- * @engine_mask: mask representing engines that are hung
- * @flags: control flags
- * @fmt: Error message format string
- *
- * Do some basic checking of register state at error time and
- * dump it to the syslog.  Also call i915_capture_error_state() to make
- * sure we get a record and make it available in debugfs.  Fire a uevent
- * so userspace knows something bad happened (should trigger collection
- * of a ring dump etc.).
- */
-void i915_handle_error(struct drm_i915_private *dev_priv,
-		       u32 engine_mask,
-		       unsigned long flags,
-		       const char *fmt, ...)
-{
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
-	char error_msg[80];
-	char *msg = NULL;
-
-	if (fmt) {
-		va_list args;
-
-		va_start(args, fmt);
-		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
-		va_end(args);
-
-		msg = error_msg;
-	}
-
-	/*
-	 * In most cases it's guaranteed that we get here with an RPM
-	 * reference held, for example because there is a pending GPU
-	 * request that won't finish until the reset is done. This
-	 * isn't the case at least when we get here by doing a
-	 * simulated reset via debugfs, so get an RPM reference.
-	 */
-	intel_runtime_pm_get(dev_priv);
-
-	engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
-
-	if (flags & I915_ERROR_CAPTURE) {
-		i915_capture_error_state(dev_priv, engine_mask, msg);
-		i915_clear_error_registers(dev_priv);
-	}
-
-	/*
-	 * Try engine reset when available. We fall back to full reset if
-	 * single reset fails.
-	 */
-	if (intel_has_reset_engine(dev_priv)) {
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
-			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &dev_priv->gpu_error.flags))
-				continue;
-
-			if (i915_reset_engine(engine, msg) == 0)
-				engine_mask &= ~intel_engine_flag(engine);
-
-			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &dev_priv->gpu_error.flags);
-			wake_up_bit(&dev_priv->gpu_error.flags,
-				    I915_RESET_ENGINE + engine->id);
-		}
-	}
-
-	if (!engine_mask)
-		goto out;
-
-	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
-		wait_event(dev_priv->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &dev_priv->gpu_error.flags));
-		goto out;
-	}
-
-	/* Prevent any other reset-engine attempt. */
-	for_each_engine(engine, dev_priv, tmp) {
-		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&dev_priv->gpu_error.flags))
-			wait_on_bit(&dev_priv->gpu_error.flags,
-				    I915_RESET_ENGINE + engine->id,
-				    TASK_UNINTERRUPTIBLE);
-	}
-
-	i915_reset_device(dev_priv, engine_mask, msg);
-
-	for_each_engine(engine, dev_priv, tmp) {
-		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &dev_priv->gpu_error.flags);
-	}
-
-	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
-	wake_up_all(&dev_priv->gpu_error.reset_queue);
-
-out:
-	intel_runtime_pm_put(dev_priv);
-}
-
 /* Called from drm generic code, passed 'crtc' which
  * we use as a pipe index
  */
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5c2c93cbab12..9bbea7baa55d 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -29,6 +29,7 @@
 #include <linux/sched/signal.h>
 
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
 {
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
new file mode 100644
index 000000000000..edf29da15a99
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -0,0 +1,1271 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2008-2018 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "i915_gpu_error.h"
+#include "i915_reset.h"
+
+#include "intel_guc.h"
+
+static void engine_skip_context(struct i915_request *rq)
+{
+	struct intel_engine_cs *engine = rq->engine;
+	struct i915_gem_context *hung_ctx = rq->gem_context;
+	struct i915_timeline *timeline = rq->timeline;
+	unsigned long flags;
+
+	GEM_BUG_ON(timeline == &engine->timeline);
+
+	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock(&timeline->lock);
+
+	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
+		if (rq->gem_context == hung_ctx)
+			i915_request_skip(rq, -EIO);
+
+	list_for_each_entry(rq, &timeline->requests, link)
+		i915_request_skip(rq, -EIO);
+
+	spin_unlock(&timeline->lock);
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+}
+
+static void client_mark_guilty(struct drm_i915_file_private *file_priv,
+			       const struct i915_gem_context *ctx)
+{
+	unsigned int score;
+	unsigned long prev_hang;
+
+	if (i915_gem_context_is_banned(ctx))
+		score = I915_CLIENT_SCORE_CONTEXT_BAN;
+	else
+		score = 0;
+
+	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
+	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
+		score += I915_CLIENT_SCORE_HANG_FAST;
+
+	if (score) {
+		atomic_add(score, &file_priv->ban_score);
+
+		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
+				 ctx->name, score,
+				 atomic_read(&file_priv->ban_score));
+	}
+}
+
+static void context_mark_guilty(struct i915_gem_context *ctx)
+{
+	unsigned int score;
+	bool banned, bannable;
+
+	atomic_inc(&ctx->guilty_count);
+
+	bannable = i915_gem_context_is_bannable(ctx);
+	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
+	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
+
+	/* Cool contexts don't accumulate client ban score */
+	if (!bannable)
+		return;
+
+	if (banned) {
+		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
+				 ctx->name, atomic_read(&ctx->guilty_count),
+				 score);
+		i915_gem_context_set_banned(ctx);
+	}
+
+	if (!IS_ERR_OR_NULL(ctx->file_priv))
+		client_mark_guilty(ctx->file_priv, ctx);
+}
+
+static void context_mark_innocent(struct i915_gem_context *ctx)
+{
+	atomic_inc(&ctx->active_count);
+}
+
+static void gen3_stop_engine(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	const u32 base = engine->mmio_base;
+
+	if (intel_engine_stop_cs(engine))
+		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
+
+	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
+	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
+
+	I915_WRITE_FW(RING_HEAD(base), 0);
+	I915_WRITE_FW(RING_TAIL(base), 0);
+	POSTING_READ_FW(RING_TAIL(base));
+
+	/* The ring must be empty before it is disabled */
+	I915_WRITE_FW(RING_CTL(base), 0);
+
+	/* Check acts as a post */
+	if (I915_READ_FW(RING_HEAD(base)) != 0)
+		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
+				 engine->name);
+}
+
+static void i915_stop_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	if (INTEL_GEN(i915) < 3)
+		return;
+
+	for_each_engine_masked(engine, i915, engine_mask, id)
+		gen3_stop_engine(engine);
+}
+
+static bool i915_in_reset(struct pci_dev *pdev)
+{
+	u8 gdrst;
+
+	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
+	return gdrst & GRDOM_RESET_STATUS;
+}
+
+static int i915_do_reset(struct drm_i915_private *i915,
+			 unsigned int engine_mask)
+{
+	struct pci_dev *pdev = i915->drm.pdev;
+	int err;
+
+	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
+	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
+	usleep_range(50, 200);
+	err = wait_for(i915_in_reset(pdev), 500);
+
+	/* Clear the reset request. */
+	pci_write_config_byte(pdev, I915_GDRST, 0);
+	usleep_range(50, 200);
+	if (!err)
+		err = wait_for(!i915_in_reset(pdev), 500);
+
+	return err;
+}
+
+static bool g4x_reset_complete(struct pci_dev *pdev)
+{
+	u8 gdrst;
+
+	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
+	return (gdrst & GRDOM_RESET_ENABLE) == 0;
+}
+
+static int g33_do_reset(struct drm_i915_private *i915, unsigned int engine_mask)
+{
+	struct pci_dev *pdev = i915->drm.pdev;
+
+	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
+	return wait_for(g4x_reset_complete(pdev), 500);
+}
+
+static int g4x_do_reset(struct drm_i915_private *dev_priv,
+			unsigned int engine_mask)
+{
+	struct pci_dev *pdev = dev_priv->drm.pdev;
+	int ret;
+
+	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
+	I915_WRITE(VDECCLK_GATE_D,
+		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ(VDECCLK_GATE_D);
+
+	pci_write_config_byte(pdev, I915_GDRST,
+			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
+	ret = wait_for(g4x_reset_complete(pdev), 500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
+		goto out;
+	}
+
+	pci_write_config_byte(pdev, I915_GDRST,
+			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
+	ret = wait_for(g4x_reset_complete(pdev), 500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
+		goto out;
+	}
+
+out:
+	pci_write_config_byte(pdev, I915_GDRST, 0);
+
+	I915_WRITE(VDECCLK_GATE_D,
+		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ(VDECCLK_GATE_D);
+
+	return ret;
+}
+
+static int ironlake_do_reset(struct drm_i915_private *dev_priv,
+			     unsigned int engine_mask)
+{
+	int ret;
+
+	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
+	ret = intel_wait_for_register(dev_priv,
+				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
+				      500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
+		goto out;
+	}
+
+	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
+	ret = intel_wait_for_register(dev_priv,
+				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
+				      500);
+	if (ret) {
+		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
+		goto out;
+	}
+
+out:
+	I915_WRITE(ILK_GDSR, 0);
+	POSTING_READ(ILK_GDSR);
+	return ret;
+}
+
+/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
+static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
+				u32 hw_domain_mask)
+{
+	int err;
+
+	/*
+	 * GEN6_GDRST is not in the gt power well, no need to check
+	 * for fifo space for the write or forcewake the chip for
+	 * the read
+	 */
+	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);
+
+	/* Wait for the device to ack the reset requests */
+	err = __intel_wait_for_register_fw(dev_priv,
+					   GEN6_GDRST, hw_domain_mask, 0,
+					   500, 0,
+					   NULL);
+	if (err)
+		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
+				 hw_domain_mask);
+
+	return err;
+}
+
+static int gen6_reset_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask)
+{
+	struct intel_engine_cs *engine;
+	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
+		[RCS] = GEN6_GRDOM_RENDER,
+		[BCS] = GEN6_GRDOM_BLT,
+		[VCS] = GEN6_GRDOM_MEDIA,
+		[VCS2] = GEN8_GRDOM_MEDIA2,
+		[VECS] = GEN6_GRDOM_VECS,
+	};
+	u32 hw_mask;
+
+	if (engine_mask == ALL_ENGINES) {
+		hw_mask = GEN6_GRDOM_FULL;
+	} else {
+		unsigned int tmp;
+
+		hw_mask = 0;
+		for_each_engine_masked(engine, i915, engine_mask, tmp)
+			hw_mask |= hw_engine_mask[engine->id];
+	}
+
+	return gen6_hw_domain_reset(i915, hw_mask);
+}
+
+static int gen11_reset_engines(struct drm_i915_private *i915,
+			       unsigned int engine_mask)
+{
+	struct intel_engine_cs *engine;
+	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
+		[RCS] = GEN11_GRDOM_RENDER,
+		[BCS] = GEN11_GRDOM_BLT,
+		[VCS] = GEN11_GRDOM_MEDIA,
+		[VCS2] = GEN11_GRDOM_MEDIA2,
+		[VCS3] = GEN11_GRDOM_MEDIA3,
+		[VCS4] = GEN11_GRDOM_MEDIA4,
+		[VECS] = GEN11_GRDOM_VECS,
+		[VECS2] = GEN11_GRDOM_VECS2,
+	};
+	u32 hw_mask;
+
+	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
+
+	if (engine_mask == ALL_ENGINES) {
+		hw_mask = GEN11_GRDOM_FULL;
+	} else {
+		unsigned int tmp;
+
+		hw_mask = 0;
+		for_each_engine_masked(engine, i915, engine_mask, tmp)
+			hw_mask |= hw_engine_mask[engine->id];
+	}
+
+	return gen6_hw_domain_reset(i915, hw_mask);
+}
+
+static int gen8_reset_engine_start(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+	int ret;
+
+	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
+		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
+
+	ret = __intel_wait_for_register_fw(dev_priv,
+					   RING_RESET_CTL(engine->mmio_base),
+					   RESET_CTL_READY_TO_RESET,
+					   RESET_CTL_READY_TO_RESET,
+					   700, 0,
+					   NULL);
+	if (ret)
+		DRM_ERROR("%s: reset request timeout\n", engine->name);
+
+	return ret;
+}
+
+static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
+{
+	struct drm_i915_private *dev_priv = engine->i915;
+
+	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
+		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
+}
+
+static int gen8_reset_engines(struct drm_i915_private *i915,
+			      unsigned int engine_mask)
+{
+	struct intel_engine_cs *engine;
+	unsigned int tmp;
+	int ret;
+
+	for_each_engine_masked(engine, i915, engine_mask, tmp) {
+		if (gen8_reset_engine_start(engine)) {
+			ret = -EIO;
+			goto not_ready;
+		}
+	}
+
+	if (INTEL_GEN(i915) >= 11)
+		ret = gen11_reset_engines(i915, engine_mask);
+	else
+		ret = gen6_reset_engines(i915, engine_mask);
+
+not_ready:
+	for_each_engine_masked(engine, i915, engine_mask, tmp)
+		gen8_reset_engine_cancel(engine);
+
+	return ret;
+}
+
+typedef int (*reset_func)(struct drm_i915_private *, unsigned int engine_mask);
+
+static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
+{
+	if (!i915_modparams.reset)
+		return NULL;
+
+	if (INTEL_GEN(i915) >= 8)
+		return gen8_reset_engines;
+	else if (INTEL_GEN(i915) >= 6)
+		return gen6_reset_engines;
+	else if (IS_GEN5(i915))
+		return ironlake_do_reset;
+	else if (IS_G4X(i915))
+		return g4x_do_reset;
+	else if (IS_G33(i915) || IS_PINEVIEW(i915))
+		return g33_do_reset;
+	else if (INTEL_GEN(i915) >= 3)
+		return i915_do_reset;
+	else
+		return NULL;
+}
+
+int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
+{
+	reset_func reset = intel_get_gpu_reset(i915);
+	int retry;
+	int ret;
+
+	/*
+	 * We want to perform per-engine reset from atomic context (e.g.
+	 * softirq), which imposes the constraint that we cannot sleep.
+	 * However, experience suggests that spending a bit of time waiting
+	 * for a reset helps in various cases, so for a full-device reset
+	 * we apply the opposite rule and wait if we want to. As we should
+	 * always follow up a failed per-engine reset with a full device reset,
+	 * being a little faster, stricter and more error prone for the
+	 * atomic case seems an acceptable compromise.
+	 *
+	 * Unfortunately this leads to a bimodal routine, when the goal was
+	 * to have a single reset function that worked for resetting any
+	 * number of engines simultaneously.
+	 */
+	might_sleep_if(engine_mask == ALL_ENGINES);
+
+	/*
+	 * If the power well sleeps during the reset, the reset
+	 * request may be dropped and never completes (causing -EIO).
+	 */
+	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
+	for (retry = 0; retry < 3; retry++) {
+		/*
+		 * We stop engines, otherwise we might get failed reset and a
+		 * dead gpu (on elk). Also, a gpu as modern as kbl can suffer
+		 * from system hang if batchbuffer is progressing when
+		 * the reset is issued, regardless of READY_TO_RESET ack.
+		 * Thus assume it is best to stop engines on all gens
+		 * where we have a gpu reset.
+		 *
+		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
+		 *
+		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
+		 *
+		 * FIXME: Wa for more modern gens needs to be validated
+		 */
+		i915_stop_engines(i915, engine_mask);
+
+		ret = -ENODEV;
+		if (reset) {
+			GEM_TRACE("engine_mask=%x\n", engine_mask);
+			ret = reset(i915, engine_mask);
+		}
+		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
+			break;
+
+		cond_resched();
+	}
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+
+	return ret;
+}
+
+bool intel_has_gpu_reset(struct drm_i915_private *i915)
+{
+	return intel_get_gpu_reset(i915);
+}
+
+bool intel_has_reset_engine(struct drm_i915_private *i915)
+{
+	return i915->info.has_reset_engine && i915_modparams.reset >= 2;
+}
+
+int intel_reset_guc(struct drm_i915_private *i915)
+{
+	u32 guc_domain =
+		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
+	int ret;
+
+	GEM_BUG_ON(!HAS_GUC(i915));
+
+	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
+	ret = gen6_hw_domain_reset(i915, guc_domain);
+	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
+
+	return ret;
+}
+
+/*
+ * Ensure the irq handler finishes, and does not run again.
+ * Also return the active request so that we only search for it once.
+ */
+static struct i915_request *
+reset_prepare_engine(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	/*
+	 * During the reset sequence, we must prevent the engine from
+	 * entering RC6. As the context state is undefined until we restart
+	 * the engine, if it does enter RC6 during the reset, the state
+	 * written to the powercontext is undefined and so we may lose
+	 * GPU state upon resume, i.e. fail to restart after a reset.
+	 */
+	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
+
+	rq = engine->reset.prepare(engine);
+	if (rq && rq->fence.error == -EIO)
+		rq = ERR_PTR(-EIO); /* Previous reset failed! */
+
+	return rq;
+}
+
+static int reset_prepare(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	struct i915_request *rq;
+	enum intel_engine_id id;
+	int err = 0;
+
+	disable_irq(i915->drm.irq);
+
+	for_each_engine(engine, i915, id) {
+		rq = reset_prepare_engine(engine);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			continue;
+		}
+
+		engine->hangcheck.active_request = rq;
+	}
+
+	i915_gem_revoke_fences(i915);
+	intel_uc_sanitize(i915);
+
+	return err;
+}
+
+/* Returns the request if it was guilty of the hang */
+static struct i915_request *
+reset_request(struct intel_engine_cs *engine,
+	      struct i915_request *rq,
+	      bool stalled)
+{
+	/*
+	 * The guilty request will get skipped on a hung engine.
+	 *
+	 * Users of client default contexts do not rely on logical
+	 * state preserved between batches so it is safe to execute
+	 * queued requests following the hang. Non default contexts
+	 * rely on preserved state, so skipping a batch loses the
+	 * evolution of the state and it needs to be considered corrupted.
+	 * Executing more queued batches on top of corrupted state is
+	 * risky. But we take the risk by trying to advance through
+	 * the queued requests in order to make the client behaviour
+	 * more predictable around resets, by not throwing away a random
+	 * number of batches it has prepared for execution. Sophisticated
+	 * clients can use gem_reset_stats_ioctl and dma fence status
+	 * (exported via sync_file info ioctl on explicit fences) to observe
+	 * when it loses the context state and should rebuild accordingly.
+	 *
+	 * The context ban, and ultimately the client ban, mechanism are safety
+	 * valves if client submission ends up resulting in nothing more than
+	 * subsequent hangs.
+	 */
+
+	if (i915_request_completed(rq)) {
+		GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
+			  engine->name, rq->global_seqno,
+			  rq->fence.context, rq->fence.seqno,
+			  intel_engine_get_seqno(engine));
+		stalled = false;
+	}
+
+	if (stalled) {
+		context_mark_guilty(rq->gem_context);
+		i915_request_skip(rq, -EIO);
+
+		/* If this context is now banned, skip all pending requests. */
+		if (i915_gem_context_is_banned(rq->gem_context))
+			engine_skip_context(rq);
+	} else {
+		/*
+		 * Since this is not the hung engine, it may have advanced
+		 * since the hang declaration. Double check by refinding
+		 * the active request at the time of the reset.
+		 */
+		rq = i915_gem_find_active_request(engine);
+		if (rq) {
+			unsigned long flags;
+
+			context_mark_innocent(rq->gem_context);
+			dma_fence_set_error(&rq->fence, -EAGAIN);
+
+			/* Rewind the engine to replay the incomplete rq */
+			spin_lock_irqsave(&engine->timeline.lock, flags);
+			rq = list_prev_entry(rq, link);
+			if (&rq->link == &engine->timeline.requests)
+				rq = NULL;
+			spin_unlock_irqrestore(&engine->timeline.lock, flags);
+		}
+	}
+
+	return rq;
+}
+
+static void reset_engine(struct intel_engine_cs *engine,
+			 struct i915_request *rq,
+			 bool stalled)
+{
+	/*
+	 * Make sure this write is visible before we re-enable the interrupt
+	 * handlers on another CPU, as tasklet_enable() resolves to just
+	 * a compiler barrier which is insufficient for our purpose here.
+	 */
+	smp_store_mb(engine->irq_posted, 0);
+
+	if (rq)
+		rq = reset_request(engine, rq, stalled);
+
+	/* Setup the CS to resume from the breadcrumb of the hung request */
+	engine->reset.reset(engine, rq);
+}
+
+static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	i915_retire_requests(i915);
+
+	for_each_engine(engine, i915, id) {
+		struct intel_context *ce;
+
+		reset_engine(engine,
+			     engine->hangcheck.active_request,
+			     stalled_mask & ENGINE_MASK(id));
+		ce = fetch_and_zero(&engine->last_retired_context);
+		if (ce)
+			intel_context_unpin(ce);
+
+		/*
+		 * Ostensibly, we always want a context loaded for powersaving,
+		 * so if the engine is idle after the reset, send a request
+		 * to load our scratch kernel_context.
+		 *
+		 * More mysteriously, if we leave the engine idle after a reset,
+		 * the next userspace batch may hang, with what appears to be
+		 * an incoherent read by the CS (presumably stale TLB). An
+		 * empty request appears sufficient to paper over the glitch.
+		 */
+		if (intel_engine_is_idle(engine)) {
+			struct i915_request *rq;
+
+			rq = i915_request_alloc(engine, i915->kernel_context);
+			if (!IS_ERR(rq))
+				i915_request_add(rq);
+		}
+	}
+
+	i915_gem_restore_fences(i915);
+}
+
+static void reset_finish_engine(struct intel_engine_cs *engine)
+{
+	engine->reset.finish(engine);
+
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+}
+
+static void reset_finish(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	for_each_engine(engine, i915, id) {
+		engine->hangcheck.active_request = NULL;
+		reset_finish_engine(engine);
+	}
+
+	enable_irq(i915->drm.irq);
+}
+
+static void nop_submit_request(struct i915_request *rq)
+{
+	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
+		  rq->engine->name, rq->fence.context, rq->fence.seqno);
+	dma_fence_set_error(&rq->fence, -EIO);
+
+	i915_request_submit(rq);
+}
+
+static void nop_complete_submit_request(struct i915_request *rq)
+{
+	unsigned long flags;
+
+	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
+		  rq->engine->name,
+		  rq->fence.context, rq->fence.seqno);
+	dma_fence_set_error(&rq->fence, -EIO);
+
+	spin_lock_irqsave(&rq->engine->timeline.lock, flags);
+	__i915_request_submit(rq);
+	intel_engine_init_global_seqno(rq->engine, rq->global_seqno);
+	spin_unlock_irqrestore(&rq->engine->timeline.lock, flags);
+}
+
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	GEM_TRACE("start\n");
+
+	if (GEM_SHOW_DEBUG()) {
+		struct drm_printer p = drm_debug_printer(__func__);
+
+		for_each_engine(engine, i915, id)
+			intel_engine_dump(engine, &p, "%s\n", engine->name);
+	}
+
+	set_bit(I915_WEDGED, &i915->gpu_error.flags);
+	smp_mb__after_atomic();
+
+	/*
+	 * First, stop submission to hw, but do not yet complete requests by
+	 * rolling the global seqno forward (since this would complete requests
+	 * for which we haven't set the fence error to EIO yet).
+	 */
+	for_each_engine(engine, i915, id) {
+		reset_prepare_engine(engine);
+
+		engine->submit_request = nop_submit_request;
+		engine->schedule = NULL;
+	}
+	i915->caps.scheduler = 0;
+
+	/* Even if the GPU reset fails, it should still stop the engines */
+	intel_gpu_reset(i915, ALL_ENGINES);
+
+	/*
+	 * Make sure no one is running the old callback before we proceed with
+	 * cancelling requests and resetting the completion tracking. Otherwise
+	 * we might submit a request to the hardware which never completes.
+	 */
+	synchronize_rcu();
+
+	for_each_engine(engine, i915, id) {
+		/* Mark all executing requests as skipped */
+		engine->cancel_requests(engine);
+
+		/*
+		 * Only once we've force-cancelled all in-flight requests can we
+		 * start to complete all requests.
+		 */
+		engine->submit_request = nop_complete_submit_request;
+	}
+
+	/*
+	 * Make sure no request can slip through without getting completed by
+	 * either this call here to intel_engine_init_global_seqno, or the one
+	 * in nop_complete_submit_request.
+	 */
+	synchronize_rcu();
+
+	for_each_engine(engine, i915, id) {
+		unsigned long flags;
+
+		/*
+		 * Mark all pending requests as complete so that any concurrent
+		 * (lockless) lookup doesn't try and wait upon the request as we
+		 * reset it.
+		 */
+		spin_lock_irqsave(&engine->timeline.lock, flags);
+		intel_engine_init_global_seqno(engine,
+					       intel_engine_last_submit(engine));
+		spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+		reset_finish_engine(engine);
+	}
+
+	GEM_TRACE("end\n");
+
+	wake_up_all(&i915->gpu_error.reset_queue);
+}
+
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_timeline *tl;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
+		return true;
+
+	GEM_TRACE("start\n");
+
+	/*
+	 * Before unwedging, make sure that all pending operations
+	 * are flushed and errored out - we may have requests waiting upon
+	 * third party fences. We marked all inflight requests as EIO, and
+	 * every execbuf since returned EIO, for consistency we want all
+	 * the currently pending requests to also be marked as EIO, which
+	 * is done inside our nop_submit_request - and so we must wait.
+	 *
+	 * No more can be submitted until we reset the wedged bit.
+	 */
+	list_for_each_entry(tl, &i915->gt.timelines, link) {
+		struct i915_request *rq;
+
+		rq = i915_gem_active_peek(&tl->last_request,
+					  &i915->drm.struct_mutex);
+		if (!rq)
+			continue;
+
+		/*
+		 * We can't use our normal waiter as we want to
+		 * avoid recursively trying to handle the current
+		 * reset. The basic dma_fence_default_wait() installs
+		 * a callback for dma_fence_signal(), which is
+		 * triggered by our nop handler (indirectly, the
+		 * callback enables the signaler thread which is
+		 * woken by the nop_submit_request() advancing the seqno
+		 * and when the seqno passes the fence, the signaler
+		 * then signals the fence waking us up).
+		 */
+		if (dma_fence_default_wait(&rq->fence, true,
+					   MAX_SCHEDULE_TIMEOUT) < 0)
+			return false;
+	}
+	i915_retire_requests(i915);
+	GEM_BUG_ON(i915->gt.active_requests);
+
+	/*
+	 * Undo nop_submit_request. We prevent all new i915 requests from
+	 * being queued (by disallowing execbuf whilst wedged) so having
+	 * waited for all active requests above, we know the system is idle
+	 * and do not have to worry about a thread being inside
+	 * engine->submit_request() as we swap over. So unlike installing
+	 * the nop_submit_request on reset, we can do this from normal
+	 * context and do not require stop_machine().
+	 */
+	intel_engines_reset_default_submission(i915);
+	i915_gem_contexts_lost(i915);
+
+	GEM_TRACE("end\n");
+
+	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
+	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
+
+	return true;
+}
+
+/**
+ * i915_reset - reset chip after a hang
+ * @i915: #drm_i915_private to reset
+ * @stalled_mask: mask of the stalled engines with the guilty requests
+ * @reason: user error message for why we are resetting
+ *
+ * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
+ * on failure.
+ *
+ * Caller must hold the struct_mutex.
+ *
+ * Procedure is fairly simple:
+ *   - reset the chip using the reset reg
+ *   - re-init context state
+ *   - re-init hardware status page
+ *   - re-init ring buffer
+ *   - re-init interrupt state
+ *   - re-init display
+ */
+void i915_reset(struct drm_i915_private *i915,
+		unsigned int stalled_mask,
+		const char *reason)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	int ret;
+	int i;
+
+	GEM_TRACE("flags=%lx\n", error->flags);
+
+	might_sleep();
+	lockdep_assert_held(&i915->drm.struct_mutex);
+	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
+
+	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
+		return;
+
+	/* Clear any previous failed attempts at recovery. Time to try again. */
+	if (!i915_gem_unset_wedged(i915))
+		goto wakeup;
+
+	if (reason)
+		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
+	error->reset_count++;
+
+	ret = reset_prepare(i915);
+	if (ret) {
+		dev_err(i915->drm.dev, "GPU recovery failed\n");
+		goto taint;
+	}
+
+	if (!intel_has_gpu_reset(i915)) {
+		if (i915_modparams.reset)
+			dev_err(i915->drm.dev, "GPU reset not supported\n");
+		else
+			DRM_DEBUG_DRIVER("GPU reset disabled\n");
+		goto error;
+	}
+
+	for (i = 0; i < 3; i++) {
+		ret = intel_gpu_reset(i915, ALL_ENGINES);
+		if (ret == 0)
+			break;
+
+		msleep(100);
+	}
+	if (ret) {
+		dev_err(i915->drm.dev, "Failed to reset chip\n");
+		goto taint;
+	}
+
+	/* Ok, now get things going again... */
+
+	/*
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.
+	 */
+	ret = i915_ggtt_enable_hw(i915);
+	if (ret) {
+		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
+			  ret);
+		goto error;
+	}
+
+	gt_reset(i915, stalled_mask);
+	intel_overlay_reset(i915);
+
+	/*
+	 * Next we need to restore the context, but we don't use those
+	 * yet either...
+	 *
+	 * Ring buffer needs to be re-initialized in the KMS case, or if X
+	 * was running at the time of the reset (i.e. we weren't VT
+	 * switched away).
+	 */
+	ret = i915_gem_init_hw(i915);
+	if (ret) {
+		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
+			  ret);
+		goto error;
+	}
+
+	i915_queue_hangcheck(i915);
+
+finish:
+	reset_finish(i915);
+wakeup:
+	clear_bit(I915_RESET_HANDOFF, &error->flags);
+	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+	return;
+
+taint:
+	/*
+	 * History tells us that if we cannot reset the GPU now, we
+	 * never will. This then impacts everything that is run
+	 * subsequently. On failing the reset, we mark the driver
+	 * as wedged, preventing further execution on the GPU.
+	 * We also want to go one step further and add a taint to the
+	 * kernel so that any subsequent faults can be traced back to
+	 * this failure. This is important for CI, where if the
+	 * GPU/driver fails we would like to reboot and restart testing
+	 * rather than continue on into oblivion. For everyone else,
+	 * the system should still plod along, but they have been warned!
+	 */
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+error:
+	i915_gem_set_wedged(i915);
+	i915_retire_requests(i915);
+	goto finish;
+}
+
+static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
+					struct intel_engine_cs *engine)
+{
+	return intel_gpu_reset(i915, intel_engine_flag(engine));
+}
+
+/**
+ * i915_reset_engine - reset GPU engine to recover from a hang
+ * @engine: engine to reset
+ * @msg: reason for GPU reset; or NULL for no dev_notice()
+ *
+ * Reset a specific GPU engine. Useful if a hang is detected.
+ * Returns zero on successful reset or otherwise an error code.
+ *
+ * Procedure is:
+ *  - identify the request that caused the hang and drop it
+ *  - reset engine (which will force the engine to idle)
+ *  - re-init/configure engine
+ */
+int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
+{
+	struct i915_gpu_error *error = &engine->i915->gpu_error;
+	struct i915_request *active_request;
+	int ret;
+
+	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
+	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
+
+	active_request = reset_prepare_engine(engine);
+	if (IS_ERR_OR_NULL(active_request)) {
+		/* Either the previous reset failed, or we pardon the reset. */
+		ret = PTR_ERR(active_request);
+		goto out;
+	}
+
+	if (msg)
+		dev_notice(engine->i915->drm.dev,
+			   "Resetting %s for %s\n", engine->name, msg);
+	error->reset_engine_count[engine->id]++;
+
+	if (!engine->i915->guc.execbuf_client)
+		ret = intel_gt_reset_engine(engine->i915, engine);
+	else
+		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
+	if (ret) {
+		/* If we fail here, we expect to fall back to a global reset */
+		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
+				 engine->i915->guc.execbuf_client ? "GuC " : "",
+				 engine->name, ret);
+		goto out;
+	}
+
+	/*
+	 * The request that caused the hang is stuck on elsp; we know the
+	 * active request and can drop it, adjusting the head to skip the
+	 * offending request and resume executing the rest of the queue.
+	 */
+	reset_engine(engine, active_request, true);
+
+	/*
+	 * The engine and its registers (and workarounds in case of render)
+	 * have been reset to their default values. Follow the init_ring
+	 * process to program RING_MODE, HWSP and re-enable submission.
+	 */
+	ret = engine->init_hw(engine);
+	if (ret)
+		goto out;
+
+out:
+	reset_finish_engine(engine);
+	return ret;
+}
+
+struct wedge_me {
+	struct delayed_work work;
+	struct drm_i915_private *i915;
+	const char *name;
+};
+
+static void wedge_me(struct work_struct *work)
+{
+	struct wedge_me *w = container_of(work, typeof(*w), work.work);
+
+	dev_err(w->i915->drm.dev,
+		"%s timed out, cancelling all in-flight rendering.\n",
+		w->name);
+	i915_gem_set_wedged(w->i915);
+}
+
+static void __init_wedge(struct wedge_me *w,
+			 struct drm_i915_private *i915,
+			 long timeout,
+			 const char *name)
+{
+	w->i915 = i915;
+	w->name = name;
+
+	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
+	schedule_delayed_work(&w->work, timeout);
+}
+
+static void __fini_wedge(struct wedge_me *w)
+{
+	cancel_delayed_work_sync(&w->work);
+	destroy_delayed_work_on_stack(&w->work);
+	w->i915 = NULL;
+}
+
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
+	     (W)->i915;							\
+	     __fini_wedge((W)))
+
+static void i915_reset_device(struct drm_i915_private *i915,
+			      u32 engine_mask,
+			      const char *reason)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
+	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
+	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
+	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
+	struct wedge_me w;
+
+	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
+
+	DRM_DEBUG_DRIVER("resetting chip\n");
+	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
+
+	/* Use a watchdog to ensure that our reset completes */
+	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
+		intel_prepare_reset(i915);
+
+		error->reason = reason;
+		error->stalled_mask = engine_mask;
+
+		/* Signal that locked waiters should reset the GPU */
+		smp_mb__before_atomic();
+		set_bit(I915_RESET_HANDOFF, &error->flags);
+		wake_up_all(&error->wait_queue);
+
+		/*
+		 * Wait for anyone holding the lock to wakeup, without
+		 * blocking indefinitely on struct_mutex.
+		 */
+		do {
+			if (mutex_trylock(&i915->drm.struct_mutex)) {
+				i915_reset(i915, engine_mask, reason);
+				mutex_unlock(&i915->drm.struct_mutex);
+			}
+		} while (wait_on_bit_timeout(&error->flags,
+					     I915_RESET_HANDOFF,
+					     TASK_UNINTERRUPTIBLE,
+					     1));
+
+		error->stalled_mask = 0;
+		error->reason = NULL;
+
+		intel_finish_reset(i915);
+	}
+
+	if (!test_bit(I915_WEDGED, &error->flags))
+		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
+}
+
+static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
+{
+	u32 eir;
+
+	if (!IS_GEN2(dev_priv))
+		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
+
+	if (INTEL_GEN(dev_priv) < 4)
+		I915_WRITE(IPEIR, I915_READ(IPEIR));
+	else
+		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
+
+	I915_WRITE(EIR, I915_READ(EIR));
+	eir = I915_READ(EIR);
+	if (eir) {
+		/*
+		 * some errors might have become stuck,
+		 * mask them.
+		 */
+		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
+		I915_WRITE(EMR, I915_READ(EMR) | eir);
+		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
+	}
+}
+
+/**
+ * i915_handle_error - handle a gpu error
+ * @i915: i915 device private
+ * @engine_mask: mask representing engines that are hung
+ * @flags: control flags
+ * @fmt: Error message format string
+ *
+ * Do some basic checking of register state at error time and
+ * dump it to the syslog.  Also call i915_capture_error_state() to make
+ * sure we get a record and make it available in debugfs.  Fire a uevent
+ * so userspace knows something bad happened (should trigger collection
+ * of a ring dump etc.).
+ */
+void i915_handle_error(struct drm_i915_private *i915,
+		       u32 engine_mask,
+		       unsigned long flags,
+		       const char *fmt, ...)
+{
+	struct intel_engine_cs *engine;
+	unsigned int tmp;
+	char error_msg[80];
+	char *msg = NULL;
+
+	if (fmt) {
+		va_list args;
+
+		va_start(args, fmt);
+		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
+		va_end(args);
+
+		msg = error_msg;
+	}
+
+	/*
+	 * In most cases it's guaranteed that we get here with an RPM
+	 * reference held, for example because there is a pending GPU
+	 * request that won't finish until the reset is done. This
+	 * isn't the case at least when we get here by doing a
+	 * simulated reset via debugfs, so get an RPM reference.
+	 */
+	intel_runtime_pm_get(i915);
+
+	engine_mask &= INTEL_INFO(i915)->ring_mask;
+
+	if (flags & I915_ERROR_CAPTURE) {
+		i915_capture_error_state(i915, engine_mask, msg);
+		i915_clear_error_registers(i915);
+	}
+
+	/*
+	 * Try engine reset when available. We fall back to full reset if
+	 * single reset fails.
+	 */
+	if (intel_has_reset_engine(i915)) {
+		for_each_engine_masked(engine, i915, engine_mask, tmp) {
+			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
+			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+					     &i915->gpu_error.flags))
+				continue;
+
+			if (i915_reset_engine(engine, msg) == 0)
+				engine_mask &= ~intel_engine_flag(engine);
+
+			clear_bit(I915_RESET_ENGINE + engine->id,
+				  &i915->gpu_error.flags);
+			wake_up_bit(&i915->gpu_error.flags,
+				    I915_RESET_ENGINE + engine->id);
+		}
+	}
+
+	if (!engine_mask)
+		goto out;
+
+	/* Full reset needs the mutex, stop any other user trying to do so. */
+	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
+		wait_event(i915->gpu_error.reset_queue,
+			   !test_bit(I915_RESET_BACKOFF,
+				     &i915->gpu_error.flags));
+		goto out;
+	}
+
+	/* Prevent any other reset-engine attempt. */
+	for_each_engine(engine, i915, tmp) {
+		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+					&i915->gpu_error.flags))
+			wait_on_bit(&i915->gpu_error.flags,
+				    I915_RESET_ENGINE + engine->id,
+				    TASK_UNINTERRUPTIBLE);
+	}
+
+	i915_reset_device(i915, engine_mask, msg);
+
+	for_each_engine(engine, i915, tmp) {
+		clear_bit(I915_RESET_ENGINE + engine->id,
+			  &i915->gpu_error.flags);
+	}
+
+	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
+	wake_up_all(&i915->gpu_error.reset_queue);
+
+out:
+	intel_runtime_pm_put(i915);
+}
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
new file mode 100644
index 000000000000..09422c4772dd
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -0,0 +1,37 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2008-2018 Intel Corporation
+ */
+
+#ifndef I915_RESET_H
+#define I915_RESET_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+struct drm_i915_private;
+struct intel_engine_cs;
+struct intel_guc;
+
+__printf(4, 5)
+void i915_handle_error(struct drm_i915_private *i915,
+		       u32 engine_mask,
+		       unsigned long flags,
+		       const char *fmt, ...);
+#define I915_ERROR_CAPTURE BIT(0)
+
+void i915_reset(struct drm_i915_private *i915,
+		unsigned int stalled_mask,
+		const char *reason);
+int i915_reset_engine(struct intel_engine_cs *engine,
+		      const char *reason);
+
+bool intel_has_gpu_reset(struct drm_i915_private *i915);
+bool intel_has_reset_engine(struct drm_i915_private *i915);
+
+int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);
+
+int intel_reset_guc(struct drm_i915_private *i915);
+
+#endif /* I915_RESET_H */
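
Not part of the diff, but for illustration: a detector such as hangcheck is
expected to drive this interface through i915_handle_error(). A minimal
sketch, assuming the mask of hung engines has already been computed into
'hung' by the caller:

	static void declare_hang(struct drm_i915_private *i915, u32 hung)
	{
		/*
		 * Capture the error state, then try a per-engine reset
		 * first, escalating to a full device reset if that fails.
		 */
		i915_handle_error(i915, hung, I915_ERROR_CAPTURE,
				  "hang on engines 0x%x", hung);
	}
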
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 7998e70a3174..8a07de5ac740 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -33,13 +33,7 @@
 #include <linux/vgaarb.h>
 #include <drm/drm_edid.h>
 #include <drm/drmP.h>
-#include "intel_drv.h"
-#include "intel_frontbuffer.h"
 #include <drm/i915_drm.h>
-#include "i915_drv.h"
-#include "i915_gem_clflush.h"
-#include "intel_dsi.h"
-#include "i915_trace.h"
 #include <drm/drm_atomic.h>
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_dp_helper.h>
@@ -49,6 +43,15 @@
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
 
+#include "intel_drv.h"
+#include "intel_dsi.h"
+#include "intel_frontbuffer.h"
+
+#include "i915_drv.h"
+#include "i915_gem_clflush.h"
+#include "i915_reset.h"
+#include "i915_trace.h"
+
 /* Primary plane formats for gen <= 3 */
 static const uint32_t i8xx_primary_formats[] = {
 	DRM_FORMAT_C8,
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 4121928a495e..df1a384c2f92 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -189,4 +189,7 @@ static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask)
 	spin_unlock_irq(&guc->irq_lock);
 }
 
+int intel_guc_reset_engine(struct intel_guc *guc,
+			   struct intel_engine_cs *engine);
+
 #endif
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 2fc7a0dd0df9..5141df342884 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -23,6 +23,7 @@
  */
 
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static bool
 ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr)
diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
index 7c95697e1a35..88352ff7164a 100644
--- a/drivers/gpu/drm/i915/intel_uc.c
+++ b/drivers/gpu/drm/i915/intel_uc.c
@@ -26,6 +26,7 @@
 #include "intel_guc_submission.h"
 #include "intel_guc.h"
 #include "i915_drv.h"
+#include "i915_reset.h"
 
 static void guc_free_load_err_log(struct intel_guc *guc);
 
diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
index b892ca8396e8..1abd342e9cce 100644
--- a/drivers/gpu/drm/i915/intel_uncore.c
+++ b/drivers/gpu/drm/i915/intel_uncore.c
@@ -1698,258 +1698,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
 	return ret;
 }
 
-static void gen3_stop_engine(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	const u32 base = engine->mmio_base;
-
-	if (intel_engine_stop_cs(engine))
-		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
-
-	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
-	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
-
-	I915_WRITE_FW(RING_HEAD(base), 0);
-	I915_WRITE_FW(RING_TAIL(base), 0);
-	POSTING_READ_FW(RING_TAIL(base));
-
-	/* The ring must be empty before it is disabled */
-	I915_WRITE_FW(RING_CTL(base), 0);
-
-	/* Check acts as a post */
-	if (I915_READ_FW(RING_HEAD(base)) != 0)
-		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
-				 engine->name);
-}
-
-static void i915_stop_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (INTEL_GEN(dev_priv) < 3)
-		return;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, id)
-		gen3_stop_engine(engine);
-}
-
-static bool i915_in_reset(struct pci_dev *pdev)
-{
-	u8 gdrst;
-
-	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
-	return gdrst & GRDOM_RESET_STATUS;
-}
-
-static int i915_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-	int err;
-
-	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
-	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	usleep_range(50, 200);
-	err = wait_for(i915_in_reset(pdev), 500);
-
-	/* Clear the reset request. */
-	pci_write_config_byte(pdev, I915_GDRST, 0);
-	usleep_range(50, 200);
-	if (!err)
-		err = wait_for(!i915_in_reset(pdev), 500);
-
-	return err;
-}
-
-static bool g4x_reset_complete(struct pci_dev *pdev)
-{
-	u8 gdrst;
-
-	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
-	return (gdrst & GRDOM_RESET_ENABLE) == 0;
-}
-
-static int g33_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-
-	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	return wait_for(g4x_reset_complete(pdev), 500);
-}
-
-static int g4x_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
-{
-	struct pci_dev *pdev = dev_priv->drm.pdev;
-	int ret;
-
-	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
-
-	pci_write_config_byte(pdev, I915_GDRST,
-			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
-		goto out;
-	}
-
-	pci_write_config_byte(pdev, I915_GDRST,
-			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
-		goto out;
-	}
-
-out:
-	pci_write_config_byte(pdev, I915_GDRST, 0);
-
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
-
-	return ret;
-}
-
-static int ironlake_do_reset(struct drm_i915_private *dev_priv,
-			     unsigned engine_mask)
-{
-	int ret;
-
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
-		goto out;
-	}
-
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
-	if (ret) {
-		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
-		goto out;
-	}
-
-out:
-	I915_WRITE(ILK_GDSR, 0);
-	POSTING_READ(ILK_GDSR);
-	return ret;
-}
-
-/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
-static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
-				u32 hw_domain_mask)
-{
-	int err;
-
-	/* GEN6_GDRST is not in the gt power well, no need to check
-	 * for fifo space for the write or forcewake the chip for
-	 * the read
-	 */
-	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
-
-	/* Wait for the device to ack the reset requests */
-	err = __intel_wait_for_register_fw(dev_priv,
-					   GEN6_GDRST, hw_domain_mask, 0,
-					   500, 0,
-					   NULL);
-	if (err)
-		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
-				 hw_domain_mask);
-
-	return err;
-}
-
-/**
- * gen6_reset_engines - reset individual engines
- * @dev_priv: i915 device
- * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
- *
- * This function will reset the individual engines that are set in engine_mask.
- * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
- *
- * Note: It is responsibility of the caller to handle the difference between
- * asking full domain reset versus reset for all available individual engines.
- *
- * Returns 0 on success, nonzero on error.
- */
-static int gen6_reset_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
-		[RCS] = GEN6_GRDOM_RENDER,
-		[BCS] = GEN6_GRDOM_BLT,
-		[VCS] = GEN6_GRDOM_MEDIA,
-		[VCS2] = GEN8_GRDOM_MEDIA2,
-		[VECS] = GEN6_GRDOM_VECS,
-	};
-	u32 hw_mask;
-
-	if (engine_mask == ALL_ENGINES) {
-		hw_mask = GEN6_GRDOM_FULL;
-	} else {
-		unsigned int tmp;
-
-		hw_mask = 0;
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-			hw_mask |= hw_engine_mask[engine->id];
-	}
-
-	return gen6_hw_domain_reset(dev_priv, hw_mask);
-}
-
-/**
- * gen11_reset_engines - reset individual engines
- * @dev_priv: i915 device
- * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
- *
- * This function will reset the individual engines that are set in engine_mask.
- * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
- *
- * Note: It is responsibility of the caller to handle the difference between
- * asking full domain reset versus reset for all available individual engines.
- *
- * Returns 0 on success, nonzero on error.
- */
-static int gen11_reset_engines(struct drm_i915_private *dev_priv,
-			       unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
-		[RCS] = GEN11_GRDOM_RENDER,
-		[BCS] = GEN11_GRDOM_BLT,
-		[VCS] = GEN11_GRDOM_MEDIA,
-		[VCS2] = GEN11_GRDOM_MEDIA2,
-		[VCS3] = GEN11_GRDOM_MEDIA3,
-		[VCS4] = GEN11_GRDOM_MEDIA4,
-		[VECS] = GEN11_GRDOM_VECS,
-		[VECS2] = GEN11_GRDOM_VECS2,
-	};
-	u32 hw_mask;
-
-	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
-
-	if (engine_mask == ALL_ENGINES) {
-		hw_mask = GEN11_GRDOM_FULL;
-	} else {
-		unsigned int tmp;
-
-		hw_mask = 0;
-		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-			hw_mask |= hw_engine_mask[engine->id];
-	}
-
-	return gen6_hw_domain_reset(dev_priv, hw_mask);
-}
-
 /**
  * __intel_wait_for_register_fw - wait until register matches expected state
  * @dev_priv: the i915 device
@@ -2060,169 +1808,6 @@ int __intel_wait_for_register(struct drm_i915_private *dev_priv,
 	return ret;
 }
 
-static int gen8_reset_engine_start(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-	int ret;
-
-	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
-		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
-
-	ret = __intel_wait_for_register_fw(dev_priv,
-					   RING_RESET_CTL(engine->mmio_base),
-					   RESET_CTL_READY_TO_RESET,
-					   RESET_CTL_READY_TO_RESET,
-					   700, 0,
-					   NULL);
-	if (ret)
-		DRM_ERROR("%s: reset request timeout\n", engine->name);
-
-	return ret;
-}
-
-static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->i915;
-
-	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
-		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
-}
-
-static int gen8_reset_engines(struct drm_i915_private *dev_priv,
-			      unsigned engine_mask)
-{
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
-	int ret;
-
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
-		if (gen8_reset_engine_start(engine)) {
-			ret = -EIO;
-			goto not_ready;
-		}
-	}
-
-	if (INTEL_GEN(dev_priv) >= 11)
-		ret = gen11_reset_engines(dev_priv, engine_mask);
-	else
-		ret = gen6_reset_engines(dev_priv, engine_mask);
-
-not_ready:
-	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
-		gen8_reset_engine_cancel(engine);
-
-	return ret;
-}
-
-typedef int (*reset_func)(struct drm_i915_private *, unsigned engine_mask);
-
-static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
-{
-	if (!i915_modparams.reset)
-		return NULL;
-
-	if (INTEL_GEN(dev_priv) >= 8)
-		return gen8_reset_engines;
-	else if (INTEL_GEN(dev_priv) >= 6)
-		return gen6_reset_engines;
-	else if (IS_GEN5(dev_priv))
-		return ironlake_do_reset;
-	else if (IS_G4X(dev_priv))
-		return g4x_do_reset;
-	else if (IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
-		return g33_do_reset;
-	else if (INTEL_GEN(dev_priv) >= 3)
-		return i915_do_reset;
-	else
-		return NULL;
-}
-
-int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
-{
-	reset_func reset = intel_get_gpu_reset(dev_priv);
-	int retry;
-	int ret;
-
-	/*
-	 * We want to perform per-engine reset from atomic context (e.g.
-	 * softirq), which imposes the constraint that we cannot sleep.
-	 * However, experience suggests that spending a bit of time waiting
-	 * for a reset helps in various cases, so for a full-device reset
-	 * we apply the opposite rule and wait if we want to. As we should
-	 * always follow up a failed per-engine reset with a full device reset,
-	 * being a little faster, stricter and more error prone for the
-	 * atomic case seems an acceptable compromise.
-	 *
-	 * Unfortunately this leads to a bimodal routine, when the goal was
-	 * to have a single reset function that worked for resetting any
-	 * number of engines simultaneously.
-	 */
-	might_sleep_if(engine_mask == ALL_ENGINES);
-
-	/*
-	 * If the power well sleeps during the reset, the reset
-	 * request may be dropped and never completes (causing -EIO).
-	 */
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-	for (retry = 0; retry < 3; retry++) {
-
-		/*
-		 * We stop engines, otherwise we might get failed reset and a
-		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
-		 * from system hang if batchbuffer is progressing when
-		 * the reset is issued, regardless of READY_TO_RESET ack.
-		 * Thus assume it is best to stop engines on all gens
-		 * where we have a gpu reset.
-		 *
-		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
-		 *
-		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
-		 *
-		 * FIXME: Wa for more modern gens needs to be validated
-		 */
-		i915_stop_engines(dev_priv, engine_mask);
-
-		ret = -ENODEV;
-		if (reset) {
-			GEM_TRACE("engine_mask=%x\n", engine_mask);
-			ret = reset(dev_priv, engine_mask);
-		}
-		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
-			break;
-
-		cond_resched();
-	}
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-
-	return ret;
-}
-
-bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
-{
-	return intel_get_gpu_reset(dev_priv) != NULL;
-}
-
-bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
-{
-	return (dev_priv->info.has_reset_engine &&
-		i915_modparams.reset >= 2);
-}
-
-int intel_reset_guc(struct drm_i915_private *dev_priv)
-{
-	u32 guc_domain = INTEL_GEN(dev_priv) >= 11 ? GEN11_GRDOM_GUC :
-						     GEN9_GRDOM_GUC;
-	int ret;
-
-	GEM_BUG_ON(!HAS_GUC(dev_priv));
-
-	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
-	ret = gen6_hw_domain_reset(dev_priv, guc_domain);
-	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
-
-	return ret;
-}
-
 bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
 {
 	return check_for_unclaimed_mmio(dev_priv);
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index fafdec3fe83e..7f842dbbea1f 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -5,6 +5,7 @@
  */
 
 #include "../i915_selftest.h"
+#include "../i915_reset.h"
 
 #include "mock_context.h"
 
-- 
2.18.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* [PATCH 7/7] drm/i915: Remove GPU reset dependence on struct_mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (5 preceding siblings ...)
  2018-07-11  7:36 ` [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
@ 2018-07-11  7:36 ` Chris Wilson
  2018-07-11  7:46 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex Patchwork
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  7:36 UTC (permalink / raw)
  To: intel-gfx

Now that the submission backends are controlled via their own spinlocks,
with a wave of a magic wand we can lift the struct_mutex requirement
around GPU reset. That is, we allow the submission frontend (userspace)
to keep on submitting while we process the GPU reset, as we can suspend
the backend independently.

The major change is around the backoff/handoff strategy for performing
the reset. With no mutex deadlock, we no longer have to coordinate with
any waiter, and just perform the reset immediately.
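
After this patch the device-level path, still guarded by the wedge-on-timeout
watchdog, reduces to roughly the following (see the i915_reset_device() hunk
below); the I915_RESET_HANDOFF bit and the mutex_trylock() polling loop are
gone:

	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
		intel_prepare_reset(i915);
		i915_reset(i915, engine_mask, reason);
		intel_finish_reset(i915);
	}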

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |   7 -
 drivers/gpu/drm/i915/i915_drv.h               |   5 -
 drivers/gpu/drm/i915/i915_gem.c               |  18 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  39 ++-
 drivers/gpu/drm/i915/i915_gem_fence_reg.h     |   6 +-
 drivers/gpu/drm/i915/i915_gpu_error.h         |  22 +-
 drivers/gpu/drm/i915/i915_request.c           |  46 ---
 drivers/gpu/drm/i915/i915_reset.c             | 316 +++++++-----------
 drivers/gpu/drm/i915/i915_reset.h             |   3 +
 drivers/gpu/drm/i915/intel_engine_cs.c        |   6 +-
 drivers/gpu/drm/i915/intel_guc_submission.c   |   5 +-
 drivers/gpu/drm/i915/intel_lrc.c              |  98 ++----
 drivers/gpu/drm/i915/intel_overlay.c          |   2 -
 drivers/gpu/drm/i915/intel_ringbuffer.c       |  89 +++--
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  13 +-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  29 +-
 16 files changed, 271 insertions(+), 433 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a0f519c44410..347af23ff6b6 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1318,8 +1318,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_puts(m, "Wedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
 		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -4081,11 +4079,6 @@ i915_wedged_set(void *data, u64 val)
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-
-	wait_on_bit(&i915->gpu_error.flags,
-		    I915_RESET_HANDOFF,
-		    TASK_UNINTERRUPTIBLE);
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 84b1073eacd8..93996484419f 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3081,11 +3081,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b5822cc36221..f010c35e5ce9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -658,11 +658,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
 		     struct intel_rps_client *rps_client)
 {
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4559,8 +4554,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	mutex_lock(&i915->drm.struct_mutex);
-
 	intel_runtime_pm_get(i915);
 	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
@@ -4590,6 +4583,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915);
 
+	mutex_lock(&i915->drm.struct_mutex);
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 }
@@ -4603,6 +4597,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	intel_runtime_pm_get(i915);
 	intel_suspend_gt_powersave(i915);
 
+	flush_workqueue(i915->wq);
+
 	mutex_lock(&i915->drm.struct_mutex);
 
 	/*
@@ -4630,11 +4626,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 		assert_kernel_context_is_current(i915);
 	}
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_reset_flush(i915);
 
-	intel_uc_suspend(i915);
-
-	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-	cancel_delayed_work_sync(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.retire_work);
 
 	/*
 	 * As the idle_work is rearming if it detects a race, play safe and
@@ -4642,6 +4636,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	 */
 	drain_delayed_work(&i915->gt.idle_work);
 
+	intel_uc_suspend(i915);
+
 	/*
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 9313a8e675c8..e458ae189f82 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -473,7 +473,7 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 }
 
 /**
- * i915_gem_revoke_fences - revoke fence state
+ * __i915_gem_revoke_fences - revoke fence state
  * @i915: i915 device private
  *
  * Removes all GTT mmappings via the fence registers. This forces any user
@@ -482,12 +482,13 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
  * revoke concurrent userspace access via GTT mmaps until the hardware has been
  * reset and the fence registers have been restored.
  */
-void i915_gem_revoke_fences(struct drm_i915_private *i915)
+void __i915_gem_revoke_fences(struct drm_i915_private *i915)
 {
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
-	mutex_lock(&ggtt->vm.mutex);
+	lockdep_assert_held(&ggtt->vm.mutex);
+
 	for (i = 0; i < ggtt->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
 
@@ -496,23 +497,15 @@ void i915_gem_revoke_fences(struct drm_i915_private *i915)
 		if (fence->vma)
 			i915_vma_revoke_mmap(fence->vma);
 	}
-	mutex_unlock(&ggtt->vm.mutex);
 }
 
-/**
- * i915_gem_restore_fences - restore fence state
- * @i915: i915 device private
- *
- * Restore the hw fence state to match the software tracking again, to be called
- * after a gpu reset and on resume. Note that on runtime suspend we only cancel
- * the fences, to be reacquired by the user later.
- */
-void i915_gem_restore_fences(struct drm_i915_private *i915)
+void __i915_gem_restore_fences(struct drm_i915_private *i915)
 {
 	struct i915_ggtt *ggtt = &i915->ggtt;
 	int i;
 
-	mutex_lock(&ggtt->vm.mutex);
+	lockdep_assert_held(&ggtt->vm.mutex);
+
 	for (i = 0; i < ggtt->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
 		struct i915_vma *vma = reg->vma;
@@ -535,6 +528,24 @@ void i915_gem_restore_fences(struct drm_i915_private *i915)
 		fence_write(reg, vma);
 		reg->vma = vma;
 	}
+}
+
+/**
+ * i915_gem_restore_fences - restore fence state
+ * @i915: i915 device private
+ *
+ * Restore the hw fence state to match the software tracking again, to be called
+ * after a gpu reset and on resume. Note that on runtime suspend we only cancel
+ * the fences, to be reacquired by the user later.
+ */
+void i915_gem_restore_fences(struct drm_i915_private *i915)
+{
+	struct i915_ggtt *ggtt = &i915->ggtt;
+
+	mutex_lock(&ggtt->vm.mutex);
+
+	__i915_gem_restore_fences(i915);
+
 	mutex_unlock(&ggtt->vm.mutex);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 6e66f6b3f851..cd50a9ec36fc 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -60,8 +60,10 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *i915);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *i915);
-void i915_gem_restore_fences(struct drm_i915_private *i915);
+void __i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
+void __i915_gem_restore_fences(struct drm_i915_private *dev_priv);
+
+void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *i915);
 void i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj,
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index f893a4e8b783..9819952a6e49 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -193,6 +193,8 @@ struct i915_gpu_state {
 	struct i915_address_space *active_vm[I915_NUM_ENGINES];
 };
 
+struct i915_gpu_restart;
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
@@ -243,15 +245,6 @@ struct i915_gpu_error {
 	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
 	 * secondary role in preventing two concurrent global reset attempts.
 	 *
-	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
-	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
-	 * but it may be held by some long running waiter (that we cannot
-	 * interrupt without causing trouble). Once we are ready to do the GPU
-	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
-	 * they already hold the struct_mutex and want to participate they can
-	 * inspect the bit and do the reset directly, otherwise the worker
-	 * waits for the struct_mutex.
-	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
 	 * flag to prevent two concurrent reset attempts in the same engine.
@@ -265,20 +258,13 @@ struct i915_gpu_error {
 	 */
 	unsigned long flags;
 #define I915_RESET_BACKOFF	0
-#define I915_RESET_HANDOFF	1
-#define I915_RESET_MODESET	2
+#define I915_RESET_MODESET	1
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 #define I915_RESET_ENGINE	(I915_WEDGED - I915_NUM_ENGINES)
 
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
 
-	/** Set of stalled engines with guilty requests, in the current reset */
-	u32 stalled_mask;
-
-	/** Reason for the current *global* reset */
-	const char *reason;
-
 	/**
 	 * Waitqueue to signal when a hang is detected. Used for waiters
 	 * to release the struct_mutex for the reset to proceed.
@@ -293,6 +279,8 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned long test_irq_rings;
+
+	struct i915_gpu_restart *restart;
 };
 
 struct drm_i915_error_state_buf {
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9bbea7baa55d..4877da9be3c0 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1247,18 +1247,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	return false;
 }
 
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
-	struct i915_gpu_error *error = &request->i915->gpu_error;
-
-	if (likely(!i915_reset_handoff(error)))
-		return false;
-
-	__set_current_state(TASK_RUNNING);
-	i915_reset(request->i915, error->stalled_mask, error->reason);
-	return true;
-}
-
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1284,17 +1272,10 @@ long i915_request_wait(struct i915_request *rq,
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
-	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
 	struct intel_wait wait;
 
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	if (i915_request_completed(rq))
@@ -1304,10 +1285,7 @@ long i915_request_wait(struct i915_request *rq,
 		return -ETIME;
 
 	trace_i915_request_wait_begin(rq, flags);
-
 	add_wait_queue(&rq->execute, &exec);
-	if (flags & I915_WAIT_LOCKED)
-		add_wait_queue(errq, &reset);
 
 	intel_wait_init(&wait);
 
@@ -1317,10 +1295,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (intel_wait_update_request(&wait, rq))
 			break;
 
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
 			goto complete;
@@ -1350,9 +1324,6 @@ long i915_request_wait(struct i915_request *rq,
 		 */
 		goto wakeup;
 
-	if (flags & I915_WAIT_LOCKED)
-		__i915_wait_request_check_and_reset(rq);
-
 	for (;;) {
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
@@ -1382,21 +1353,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (__i915_request_irq_complete(rq))
 			break;
 
-		/*
-		 * If the GPU is hung, and we hold the lock, reset the GPU
-		 * and then check for completion. On a full reset, the engine's
-		 * HW seqno will be advanced passed us and we are complete.
-		 * If we do a partial reset, we have to wait for the GPU to
-		 * resume and update the breadcrumb.
-		 *
-		 * If we don't hold the mutex, we can just wait for the worker
-		 * to come along and update the breadcrumb (either directly
-		 * itself, or indirectly by recovering the GPU).
-		 */
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		/* Only spin if we know the GPU is processing this request */
 		if (__i915_spin_request(rq, wait.seqno, state, 2))
 			break;
@@ -1410,8 +1366,6 @@ long i915_request_wait(struct i915_request *rq,
 	intel_engine_remove_wait(rq->engine, &wait);
 complete:
 	__set_current_state(TASK_RUNNING);
-	if (flags & I915_WAIT_LOCKED)
-		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
 	trace_i915_request_wait_end(rq);
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index edf29da15a99..3717899a287c 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -15,22 +15,23 @@ static void engine_skip_context(struct i915_request *rq)
 	struct intel_engine_cs *engine = rq->engine;
 	struct i915_gem_context *hung_ctx = rq->gem_context;
 	struct i915_timeline *timeline = rq->timeline;
-	unsigned long flags;
 
+	lockdep_assert_held(&engine->timeline.lock);
 	GEM_BUG_ON(timeline == &engine->timeline);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
 	spin_lock(&timeline->lock);
 
-	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
-		if (rq->gem_context == hung_ctx)
-			i915_request_skip(rq, -EIO);
+	if (rq->global_seqno) {
+		list_for_each_entry_continue(rq,
+					     &engine->timeline.requests, link)
+			if (rq->gem_context == hung_ctx)
+				i915_request_skip(rq, -EIO);
+	}
 
 	list_for_each_entry(rq, &timeline->requests, link)
 		i915_request_skip(rq, -EIO);
 
 	spin_unlock(&timeline->lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
@@ -57,7 +58,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
 	}
 }
 
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
 {
 	unsigned int score;
 	bool banned, bannable;
@@ -70,7 +71,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
-		return;
+		return false;
 
 	if (banned) {
 		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
@@ -81,6 +82,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	if (!IS_ERR_OR_NULL(ctx->file_priv))
 		client_mark_guilty(ctx->file_priv, ctx);
+
+	return banned;
 }
 
 static void context_mark_innocent(struct i915_gem_context *ctx)
@@ -88,6 +91,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
 	atomic_inc(&ctx->active_count);
 }
 
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+	lockdep_assert_held(&rq->engine->timeline.lock);
+	GEM_BUG_ON(i915_request_completed(rq));
+
+	if (guilty) {
+		i915_request_skip(rq, -EIO);
+		if (context_mark_guilty(rq->gem_context))
+			engine_skip_context(rq);
+	} else {
+		dma_fence_set_error(&rq->fence, -EAGAIN);
+		context_mark_innocent(rq->gem_context);
+	}
+}
+
 static void gen3_stop_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
@@ -482,11 +500,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
  * Ensure irq handler finishes, and not run again.
  * Also return the active request so that we only search for it once.
  */
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq;
-
 	/*
 	 * During the reset sequence, we must prevent the engine from
 	 * entering RC6. As the context state is undefined until we restart
@@ -495,171 +510,78 @@ reset_prepare_engine(struct intel_engine_cs *engine)
 	 * GPU state upon resume, i.e. fail to restart after a reset.
 	 */
 	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-	rq = engine->reset.prepare(engine);
-	if (rq && rq->fence.error == -EIO)
-		rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
-	return rq;
+	engine->reset.prepare(engine);
 }
 
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
-	struct i915_request *rq;
 	enum intel_engine_id id;
-	int err = 0;
-
-	disable_irq(i915->drm.irq);
-
-	for_each_engine(engine, i915, id) {
-		rq = reset_prepare_engine(engine);
-		if (IS_ERR(rq)) {
-			err = PTR_ERR(rq);
-			continue;
-		}
 
-		engine->hangcheck.active_request = rq;
-	}
+	for_each_engine(engine, i915, id)
+		reset_prepare_engine(engine);
 
-	i915_gem_revoke_fences(i915);
 	intel_uc_sanitize(i915);
-
-	return err;
 }
 
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
-	      struct i915_request *rq,
-	      bool stalled)
+static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	/*
-	 * The guilty request will get skipped on a hung engine.
-	 *
-	 * Users of client default contexts do not rely on logical
-	 * state preserved between batches so it is safe to execute
-	 * queued requests following the hang. Non default contexts
-	 * rely on preserved state, so skipping a batch loses the
-	 * evolution of the state and it needs to be considered corrupted.
-	 * Executing more queued batches on top of corrupted state is
-	 * risky. But we take the risk by trying to advance through
-	 * the queued requests in order to make the client behaviour
-	 * more predictable around resets, by not throwing away random
-	 * amount of batches it has prepared for execution. Sophisticated
-	 * clients can use gem_reset_stats_ioctl and dma fence status
-	 * (exported via sync_file info ioctl on explicit fences) to observe
-	 * when it loses the context state and should rebuild accordingly.
-	 *
-	 * The context ban, and ultimately the client ban, mechanism are safety
-	 * valves if client submission ends up resulting in nothing more than
-	 * subsequent hangs.
-	 */
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 
-	if (i915_request_completed(rq)) {
-		GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
-			  engine->name, rq->global_seqno,
-			  rq->fence.context, rq->fence.seqno,
-			  intel_engine_get_seqno(engine));
-		stalled = false;
-	}
+	mutex_lock(&i915->ggtt.vm.mutex);
+	__i915_gem_revoke_fences(i915);
 
-	if (stalled) {
-		context_mark_guilty(rq->gem_context);
-		i915_request_skip(rq, -EIO);
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
 
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(rq->gem_context))
-			engine_skip_context(rq);
-	} else {
-		/*
-		 * Since this is not the hung engine, it may have advanced
-		 * since the hang declaration. Double check by refinding
-		 * the active request at the time of the reset.
-		 */
-		rq = i915_gem_find_active_request(engine);
-		if (rq) {
-			unsigned long flags;
-
-			context_mark_innocent(rq->gem_context);
-			dma_fence_set_error(&rq->fence, -EAGAIN);
-
-			/* Rewind the engine to replay the incomplete rq */
-			spin_lock_irqsave(&engine->timeline.lock, flags);
-			rq = list_prev_entry(rq, link);
-			if (&rq->link == &engine->timeline.requests)
-				rq = NULL;
-			spin_unlock_irqrestore(&engine->timeline.lock, flags);
-		}
-	}
-
-	return rq;
+	__i915_gem_restore_fences(i915);
+	mutex_unlock(&i915->ggtt.vm.mutex);
 }
 
-static void reset_engine(struct intel_engine_cs *engine,
-			 struct i915_request *rq,
-			 bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
 {
-	/*
-	 * Make sure this write is visible before we re-enable the interrupt
-	 * handlers on another CPU, as tasklet_enable() resolves to just
-	 * a compiler barrier which is insufficient for our purpose here.
-	 */
-	smp_store_mb(engine->irq_posted, 0);
-
-	if (rq)
-		rq = reset_request(engine, rq, stalled);
-
-	/* Setup the CS to resume from the breadcrumb of the hung request */
-	engine->reset.reset(engine, rq);
+	engine->reset.finish(engine);
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
 }
 
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+struct i915_gpu_restart {
+	struct work_struct work;
+	struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
 {
+	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+	struct drm_i915_private *i915 = arg->i915;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	intel_runtime_pm_get(i915);
+	mutex_lock(&i915->drm.struct_mutex);
 
-	i915_retire_requests(i915);
+	smp_store_mb(i915->gpu_error.restart, NULL);
 
 	for_each_engine(engine, i915, id) {
-		struct intel_context *ce;
-
-		reset_engine(engine,
-			     engine->hangcheck.active_request,
-			     stalled_mask & ENGINE_MASK(id));
-		ce = fetch_and_zero(&engine->last_retired_context);
-		if (ce)
-			intel_context_unpin(ce);
+		struct i915_request *rq;
 
 		/*
 		 * Ostensibly, we always want a context loaded for powersaving,
 		 * so if the engine is idle after the reset, send a request
 		 * to load our scratch kernel_context.
-		 *
-		 * More mysteriously, if we leave the engine idle after a reset,
-		 * the next userspace batch may hang, with what appears to be
-		 * an incoherent read by the CS (presumably stale TLB). An
-		 * empty request appears sufficient to paper over the glitch.
 		 */
-		if (intel_engine_is_idle(engine)) {
-			struct i915_request *rq;
+		if (!intel_engine_is_idle(engine))
+			continue;
 
-			rq = i915_request_alloc(engine, i915->kernel_context);
-			if (!IS_ERR(rq))
-				i915_request_add(rq);
-		}
+		rq = i915_request_alloc(engine, i915->kernel_context);
+		if (!IS_ERR(rq))
+			i915_request_add(rq);
 	}
 
-	i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
-	engine->reset.finish(engine);
+	mutex_unlock(&i915->drm.struct_mutex);
+	intel_runtime_pm_put(i915);
 
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+	kfree(arg);
 }
 
 static void reset_finish(struct drm_i915_private *i915)
@@ -667,14 +589,26 @@ static void reset_finish(struct drm_i915_private *i915)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
-	for_each_engine(engine, i915, id) {
-		engine->hangcheck.active_request = NULL;
+	for_each_engine(engine, i915, id)
 		reset_finish_engine(engine);
-	}
 
-	enable_irq(i915->drm.irq);
+	/*
+	 * Following the reset, ensure that we always reload a context for
+	 * powersaving and correct engine->last_retired_context.
+	 */
+	if (!i915_terminally_wedged(&i915->gpu_error) &&
+	    !READ_ONCE(i915->gpu_error.restart)) {
+		struct i915_gpu_restart *arg;
+
+		arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+		if (arg) {
+			arg->i915 = i915;
+			INIT_WORK(&arg->work, restart_work);
+
+			WRITE_ONCE(i915->gpu_error.restart, arg);
+			queue_work(i915->wq, &arg->work);
+		}
+	}
 }
 
 static void nop_submit_request(struct i915_request *rq)
@@ -784,7 +718,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_timeline *tl;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
 	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
 		return true;
 
@@ -802,9 +735,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 */
 	list_for_each_entry(tl, &i915->gt.timelines, link) {
 		struct i915_request *rq;
+		long timeout;
 
-		rq = i915_gem_active_peek(&tl->last_request,
-					  &i915->drm.struct_mutex);
+		rq = i915_gem_active_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
@@ -819,12 +752,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		 * and when the seqno passes the fence, the signaler
 		 * then signals the fence waking us up).
 		 */
-		if (dma_fence_default_wait(&rq->fence, true,
-					   MAX_SCHEDULE_TIMEOUT) < 0)
+		timeout = dma_fence_default_wait(&rq->fence, true,
+						 MAX_SCHEDULE_TIMEOUT);
+		i915_request_put(rq);
+		if (timeout < 0)
 			return false;
 	}
-	i915_retire_requests(i915);
-	GEM_BUG_ON(i915->gt.active_requests);
 
 	/*
 	 * Undo nop_submit_request. We prevent all new i915 requests from
@@ -836,7 +769,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
-	i915_gem_contexts_lost(i915);
 
 	GEM_TRACE("end\n");
 
@@ -876,25 +808,17 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_TRACE("flags=%lx\n", error->flags);
 
 	might_sleep();
-	lockdep_assert_held(&i915->drm.struct_mutex);
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-		return;
-
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	if (!i915_gem_unset_wedged(i915))
-		goto wakeup;
+		return;
 
 	if (reason)
 		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
 	error->reset_count++;
 
-	ret = reset_prepare(i915);
-	if (ret) {
-		dev_err(i915->drm.dev, "GPU recovery failed\n");
-		goto taint;
-	}
+	reset_prepare(i915);
 
 	if (!intel_has_gpu_reset(i915)) {
 		if (i915_modparams.reset)
@@ -951,9 +875,6 @@ void i915_reset(struct drm_i915_private *i915,
 
 finish:
 	reset_finish(i915);
-wakeup:
-	clear_bit(I915_RESET_HANDOFF, &error->flags);
-	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
 	return;
 
 taint:
@@ -972,7 +893,6 @@ void i915_reset(struct drm_i915_private *i915,
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
 	i915_gem_set_wedged(i915);
-	i915_retire_requests(i915);
 	goto finish;
 }
 
@@ -998,18 +918,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 {
 	struct i915_gpu_error *error = &engine->i915->gpu_error;
-	struct i915_request *active_request;
 	int ret;
 
 	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-	active_request = reset_prepare_engine(engine);
-	if (IS_ERR_OR_NULL(active_request)) {
-		/* Either the previous reset failed, or we pardon the reset. */
-		ret = PTR_ERR(active_request);
-		goto out;
-	}
+	if (i915_seqno_passed(intel_engine_get_seqno(engine),
+			      intel_engine_last_submit(engine)))
+		return 0;
+
+	reset_prepare_engine(engine);
 
 	if (msg)
 		dev_notice(engine->i915->drm.dev,
@@ -1033,7 +951,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
-	reset_engine(engine, active_request, true);
+	intel_engine_reset(engine, true);
 
 	/*
 	 * The engine and its registers (and workarounds in case of render)
@@ -1109,30 +1027,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
-		error->reason = reason;
-		error->stalled_mask = engine_mask;
-
-		/* Signal that locked waiters should reset the GPU */
-		smp_mb__before_atomic();
-		set_bit(I915_RESET_HANDOFF, &error->flags);
-		wake_up_all(&error->wait_queue);
-
-		/*
-		 * Wait for anyone holding the lock to wakeup, without
-		 * blocking indefinitely on struct_mutex.
-		 */
-		do {
-			if (mutex_trylock(&i915->drm.struct_mutex)) {
-				i915_reset(i915, engine_mask, reason);
-				mutex_unlock(&i915->drm.struct_mutex);
-			}
-		} while (wait_on_bit_timeout(&error->flags,
-					     I915_RESET_HANDOFF,
-					     TASK_UNINTERRUPTIBLE,
-					     1));
-
-		error->stalled_mask = 0;
-		error->reason = NULL;
+		i915_reset(i915, engine_mask, reason);
 
 		intel_finish_reset(i915);
 	}
@@ -1269,3 +1164,22 @@ void i915_handle_error(struct drm_i915_private *i915,
 out:
 	intel_runtime_pm_put(i915);
 }
+
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+	int err;
+
+	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+	flush_workqueue(i915->wq);
+	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED |
+				     I915_WAIT_FOR_IDLE_BOOST,
+				     MAX_SCHEDULE_TIMEOUT);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return !err;
+}
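
For reference, not part of the diff: with request blame moved out of the
global reset path, each submission backend's ->reset() hook is expected to
call the new i915_reset_request() itself, under the engine timeline lock it
already holds. A rough sketch; how the backend finds 'rq' (the request that
was executing when the engine was stopped) is backend specific and assumed
here:

	static void backend_reset(struct intel_engine_cs *engine, bool stalled)
	{
		struct i915_request *rq;
		unsigned long flags;

		spin_lock_irqsave(&engine->timeline.lock, flags);
		rq = find_active_request(engine); /* backend specific, assumed */
		if (rq && !i915_request_completed(rq))
			i915_reset_request(rq, stalled);
		spin_unlock_irqrestore(&engine->timeline.lock, flags);
	}
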
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index 09422c4772dd..39d258a82e97 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -27,6 +27,9 @@ void i915_reset(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine,
 		      const char *reason);
 
+void i915_reset_request(struct i915_request *rq, bool guilty);
+bool i915_reset_flush(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 0ac497275a51..7d2ff48d7512 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1087,10 +1087,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	for_each_engine(engine, i915, id) {
-		if (engine->reset.reset)
-			engine->reset.reset(engine, NULL);
-	}
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, false);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index f3945258fe1b..326de158dae6 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -814,8 +814,7 @@ static void guc_submission_tasklet(unsigned long data)
 		guc_dequeue(engine);
 }
 
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 
@@ -841,8 +840,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
 	 */
 	if (engine->i915->guc.preempt_wq)
 		flush_workqueue(engine->i915->guc.preempt_wq);
-
-	return i915_gem_find_active_request(engine);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 933495996e91..c32603bd6fca 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -137,6 +137,7 @@
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
@@ -325,9 +326,10 @@ static void unwind_wa_tail(struct i915_request *rq)
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq, *rn;
+	struct i915_request *rq, *rn, *active = NULL;
 	struct i915_priolist *uninitialized_var(p);
 	int last_prio = I915_PRIORITY_INVALID;
 
@@ -337,7 +339,7 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 					 &engine->timeline.requests,
 					 link) {
 		if (i915_request_completed(rq))
-			return;
+			break;
 
 		__i915_request_unsubmit(rq);
 		unwind_wa_tail(rq);
@@ -350,7 +352,11 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 
 		GEM_BUG_ON(p->priority != rq_prio(rq));
 		list_add(&rq->sched.link, &p->requests);
+
+		active = rq;
 	}
+
+	return active;
 }
 
 void
@@ -1901,14 +1907,13 @@ static int gen9_init_render_ring(struct intel_engine_cs *engine)
 	return 0;
 }
 
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct i915_request *request, *active;
 	unsigned long flags;
 
-	GEM_TRACE("%s\n", engine->name);
+	GEM_TRACE("%s, tasklet disabled?=%d\n",
+		  engine->name, atomic_read(&execlists->tasklet.count));
 
 	/*
 	 * Prevent request submission to the hardware until we have
@@ -1920,59 +1925,20 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
 	 * prevents the race.
 	 */
 	__tasklet_disable_sync_once(&execlists->tasklet);
+	GEM_BUG_ON(!reset_in_progress(execlists));
 
+	/* And flush any current direct submission. */
 	spin_lock_irqsave(&engine->timeline.lock, flags);
-
-	/*
-	 * We want to flush the pending context switches, having disabled
-	 * the tasklet above, we can assume exclusive access to the execlists.
-	 * For this allows us to catch up with an inflight preemption event,
-	 * and avoid blaming an innocent request if the stall was due to the
-	 * preemption itself.
-	 */
-	process_csb(engine);
-
-	/*
-	 * The last active request can then be no later than the last request
-	 * now in ELSP[0]. So search backwards from there, so that if the GPU
-	 * has advanced beyond the last CSB update, it will be pardoned.
-	 */
-	active = NULL;
-	request = port_request(execlists->port);
-	if (request) {
-		/*
-		 * Prevent the breadcrumb from advancing before we decide
-		 * which request is currently active.
-		 */
-		intel_engine_stop_cs(engine);
-
-		list_for_each_entry_from_reverse(request,
-						 &engine->timeline.requests,
-						 link) {
-			if (__i915_request_completed(request,
-						     request->global_seqno))
-				break;
-
-			active = request;
-		}
-	}
-
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-	return active;
 }
 
-static void execlists_reset(struct intel_engine_cs *engine,
-			    struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *rq;
 	unsigned long flags;
 	u32 *regs;
 
-	GEM_TRACE("%s request global=%x, current=%d\n",
-		  engine->name, request ? request->global_seqno : 0,
-		  intel_engine_get_seqno(engine));
-
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	/*
@@ -1988,12 +1954,16 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	reset_irq(engine);
 
 	/* Push back any incomplete requests for replay after the reset. */
-	__unwind_incomplete_requests(engine);
+	rq = __unwind_incomplete_requests(engine);
 
 	/* Following the reset, we need to reload the CSB read/write pointers */
 	reset_csb_pointers(&engine->execlists);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	GEM_TRACE("%s request global=%x, current=%d\n",
+		  engine->name, rq ? rq->global_seqno : 0,
+		  intel_engine_get_seqno(engine));
+	if (!rq)
+		goto out_unlock;
 
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
@@ -2006,8 +1976,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * and have to at least restore the RING register in the context
 	 * image back to the expected values to skip over the guilty request.
 	 */
-	if (!request || request->fence.error != -EIO)
-		return;
+	i915_reset_request(rq, stalled);
+	if (!stalled)
+		goto out_unlock;
 
 	/*
 	 * We want a simple context + ring to execute the breadcrumb update.
@@ -2017,25 +1988,23 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	regs = request->hw_context->lrc_reg_state;
+	regs = rq->hw_context->lrc_reg_state;
 	if (engine->pinned_default_state) {
 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
 		       engine->context_size - PAGE_SIZE);
 	}
-	execlists_init_reg_state(regs,
-				 request->gem_context, engine, request->ring);
+	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
+	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma);
 
-	request->ring->head = intel_ring_wrap(request->ring, request->postfix);
-	regs[CTX_RING_HEAD + 1] = request->ring->head;
+	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+	regs[CTX_RING_HEAD + 1] = rq->ring->head;
+	intel_ring_update_space(rq->ring);
 
-	intel_ring_update_space(request->ring);
-
-	/* Reset WaIdleLiteRestore:bdw,skl as well */
-	unwind_wa_tail(request);
+out_unlock:
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void execlists_reset_finish(struct intel_engine_cs *engine)
@@ -2055,6 +2024,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
 	 * serialising multiple attempts to reset so that we know that we
 	 * are the only one manipulating tasklet state.
 	 */
+	GEM_BUG_ON(!reset_in_progress(execlists));
 	__tasklet_enable_sync_once(&execlists->tasklet);
 
 	GEM_TRACE("%s\n", engine->name);
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index c2f10d899329..371eb3dbedc0 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -501,8 +501,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
 	if (!overlay)
 		return;
 
-	intel_overlay_release_old_vid(overlay);
-
 	overlay->old_xscale = 0;
 	overlay->old_yscale = 0;
 	overlay->crtc = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index f4bd185c9369..5e6d2d79b617 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -34,6 +34,7 @@
 
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include "intel_workarounds.h"
@@ -533,54 +534,82 @@ static int init_ring_common(struct intel_engine_cs *engine)
 	return ret;
 }
 
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
 {
 	intel_engine_stop_cs(engine);
 
 	if (engine->irq_seqno_barrier)
 		engine->irq_seqno_barrier(engine);
-
-	return i915_gem_find_active_request(engine);
 }
 
-static void skip_request(struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 {
-	void *vaddr = rq->ring->vaddr;
+	struct i915_timeline *tl = &engine->timeline;
+	struct i915_request *pos, *rq;
+	unsigned long flags;
 	u32 head;
 
-	head = rq->infix;
-	if (rq->postfix < head) {
-		memset32(vaddr + head, MI_NOOP,
-			 (rq->ring->size - head) / sizeof(u32));
-		head = 0;
+	rq = NULL;
+	spin_lock_irqsave(&tl->lock, flags);
+	list_for_each_entry(pos, &tl->requests, link) {
+		if (!__i915_request_completed(pos, pos->global_seqno)) {
+			rq = pos;
+			break;
+		}
 	}
-	memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
-}
-
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
-{
-	GEM_TRACE("%s seqno=%x\n", engine->name, rq ? rq->global_seqno : 0);
 
+	GEM_TRACE("%s seqno=%x, stalled? %s\n",
+		  engine->name,
+		  rq ? rq->global_seqno : 0,
+		  yesno(stalled));
 	/*
-	 * Try to restore the logical GPU state to match the continuation
-	 * of the request queue. If we skip the context/PD restore, then
-	 * the next request may try to execute assuming that its context
-	 * is valid and loaded on the GPU and so may try to access invalid
-	 * memory, prompting repeated GPU hangs.
+	 * The guilty request will get skipped on a hung engine.
 	 *
-	 * If the request was guilty, we still restore the logical state
-	 * in case the next request requires it (e.g. the aliasing ppgtt),
-	 * but skip over the hung batch.
+	 * Users of client default contexts do not rely on logical
+	 * state preserved between batches so it is safe to execute
+	 * queued requests following the hang. Non-default contexts
+	 * rely on preserved state, so skipping a batch loses the
+	 * evolution of the state and it needs to be considered corrupted.
+	 * Executing more queued batches on top of corrupted state is
+	 * risky. But we take the risk by trying to advance through
+	 * the queued requests in order to make the client behaviour
+	 * more predictable around resets, by not throwing away a random
+	 * number of batches it has prepared for execution. Sophisticated
+	 * clients can use gem_reset_stats_ioctl and dma fence status
+	 * (exported via sync_file info ioctl on explicit fences) to observe
+	 * when they lose the context state and should rebuild accordingly.
 	 *
-	 * If the request was innocent, we try to replay the request with
-	 * the restored context.
+	 * The context ban, and ultimately the client ban, mechanism are safety
+	 * valves if client submission ends up resulting in nothing more than
+	 * subsequent hangs.
 	 */
+
 	if (rq) {
-		/* If the rq hung, jump to its breadcrumb and skip the batch */
-		rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
-		if (rq->fence.error == -EIO)
-			skip_request(rq);
+		/*
+		 * Try to restore the logical GPU state to match the
+		 * continuation of the request queue. If we skip the
+		 * context/PD restore, then the next request may try to execute
+		 * assuming that its context is valid and loaded on the GPU and
+		 * so may try to access invalid memory, prompting repeated GPU
+		 * hangs.
+		 *
+		 * If the request was guilty, we still restore the logical
+		 * state in case the next request requires it (e.g. the
+		 * aliasing ppgtt), but skip over the hung batch.
+		 *
+		 * If the request was innocent, we try to replay the request
+		 * with the restored context.
+		 */
+		i915_reset_request(rq, stalled);
+
+		GEM_BUG_ON(rq->ring != engine->buffer);
+		head = rq->head;
+	} else {
+		head = engine->buffer->tail;
 	}
+	engine->buffer->head = intel_ring_wrap(engine->buffer, head);
+
+	spin_unlock_irqrestore(&tl->lock, flags);
 }
 
 static void reset_finish(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index ce6cc2a6cf7a..98675fa8e8cc 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -121,7 +121,6 @@ struct intel_engine_hangcheck {
 	unsigned long action_timestamp;
 	int deadlock;
 	struct intel_instdone instdone;
-	struct i915_request *active_request;
 	bool stalled:1;
 	bool wedged:1;
 };
@@ -446,9 +445,8 @@ struct intel_engine_cs {
 	int		(*init_hw)(struct intel_engine_cs *engine);
 
 	struct {
-		struct i915_request *(*prepare)(struct intel_engine_cs *engine);
-		void (*reset)(struct intel_engine_cs *engine,
-			      struct i915_request *rq);
+		void (*prepare)(struct intel_engine_cs *engine);
+		void (*reset)(struct intel_engine_cs *engine, bool stalled);
 		void (*finish)(struct intel_engine_cs *engine);
 	} reset;
 
@@ -1070,6 +1068,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
 	return cs;
 }
 
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+				      bool stalled)
+{
+	if (engine->reset.reset)
+		engine->reset.reset(engine, stalled);
+}
+
 void intel_engines_sanitize(struct drm_i915_private *i915);
 
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 73462a65a330..64aaef0abd90 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -390,7 +390,6 @@ static int igt_global_reset(void *arg)
 	/* Check that we can issue a global GPU reset */
 
 	global_reset_lock(i915);
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	reset_count = i915_reset_count(&i915->gpu_error);
@@ -403,7 +402,6 @@ static int igt_global_reset(void *arg)
 	}
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
@@ -513,7 +511,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				break;
 			}
 
-			if (!wait_for_idle(engine)) {
+			if (!i915_reset_flush(i915)) {
 				struct drm_printer p =
 					drm_info_printer(i915->drm.dev);
 
@@ -905,20 +903,13 @@ static int igt_reset_engines(void *arg)
 	return 0;
 }
 
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
 {
-	struct i915_gpu_error *error = &rq->i915->gpu_error;
-	u32 reset_count = i915_reset_count(error);
+	u32 count = i915_reset_count(&i915->gpu_error);
 
-	error->stalled_mask = mask;
+	i915_reset(i915, mask, NULL);
 
-	/* set_bit() must be after we have setup the backchannel (mask) */
-	smp_mb__before_atomic();
-	set_bit(I915_RESET_HANDOFF, &error->flags);
-
-	wake_up_all(&error->wait_queue);
-
-	return reset_count;
+	return count;
 }
 
 static int igt_wait_reset(void *arg)
@@ -964,7 +955,7 @@ static int igt_wait_reset(void *arg)
 		goto out_rq;
 	}
 
-	reset_count = fake_hangcheck(rq, ALL_ENGINES);
+	reset_count = fake_hangcheck(i915, ALL_ENGINES);
 
 	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
 	if (timeout < 0) {
@@ -974,7 +965,6 @@ static int igt_wait_reset(void *arg)
 		goto out_rq;
 	}
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	if (i915_reset_count(&i915->gpu_error) == reset_count) {
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
@@ -1100,12 +1090,7 @@ static int igt_reset_queue(void *arg)
 				goto fini;
 			}
 
-			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
-			i915_reset(i915, ENGINE_MASK(id), NULL);
-
-			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
-					    &i915->gpu_error.flags));
+			reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
 
 			if (prev->fence.error != -EIO) {
 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
-- 
2.18.0
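
As a purely illustrative aside on the reset_ring() comment above about
clients observing lost context state: a userspace client could compare the
per-context reset stats between uses and rebuild its state when its context
was the one hung. This is only a sketch (context_lost is a made-up helper
name, error handling is minimal); the sync_file status route mentioned in
the comment is the alternative for explicit fences.

#include <stdbool.h>
#include <string.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Returns true when the given hw context has been reset since the last
 * check, i.e. its logical state must be rebuilt before it is reused. */
static bool context_lost(int drm_fd, __u32 ctx_id, __u32 *last_active)
{
	struct drm_i915_reset_stats stats;

	memset(&stats, 0, sizeof(stats));
	stats.ctx_id = ctx_id;

	if (ioctl(drm_fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
		return true;	/* be pessimistic on error */

	/* batch_active counts hangs in which this context was executing,
	 * i.e. occasions on which its logical state was thrown away. */
	if (stats.batch_active == *last_active)
		return false;

	*last_active = stats.batch_active;
	return true;
}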


^ permalink raw reply related	[flat|nested] 26+ messages in thread

* ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (6 preceding siblings ...)
  2018-07-11  7:36 ` [PATCH 7/7] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
@ 2018-07-11  7:46 ` Patchwork
  2018-07-11  7:50 ` ✗ Fi.CI.SPARSE: " Patchwork
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 26+ messages in thread
From: Patchwork @ 2018-07-11  7:46 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
URL   : https://patchwork.freedesktop.org/series/46289/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
bfb49745fd80 drm/i915: Introduce i915_address_space.mutex
0fc3f4498130 drm/i915: Move fence register tracking to GGTT
236a17479b0d drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
c2383b61bd9b drm/i915: Move fence-reg interface to i915_gem_fence_reg.h
eb0335ad83e2 drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg
53e04e954008 drm/i915: Pull all the reset functionality together into i915_reset.c
-:1070: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#1070: 
new file mode 100644

-:1075: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#1075: FILE: drivers/gpu/drm/i915/i915_reset.c:1:
+/*

-:1216: WARNING:TYPO_SPELLING: 'acknowledgement' may be misspelled - perhaps 'acknowledgment'?
#1216: FILE: drivers/gpu/drm/i915/i915_reset.c:142:
+	/* Assert reset for at least 20 usec, and wait for acknowledgement. */

-:1793: WARNING:MEMORY_BARRIER: memory barrier without comment
#1793: FILE: drivers/gpu/drm/i915/i915_reset.c:719:
+	smp_mb__after_atomic();

-:2161: CHECK:MACRO_ARG_REUSE: Macro argument reuse 'W' - possible side-effects?
#2161: FILE: drivers/gpu/drm/i915/i915_reset.c:1087:
+#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
+	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
+	     (W)->i915;							\
+	     __fini_wedge((W)))

-:2172: WARNING:STATIC_CONST_CHAR_ARRAY: char * array declaration might be better as static const
#2172: FILE: drivers/gpu/drm/i915/i915_reset.c:1098:
+	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };

-:2173: WARNING:STATIC_CONST_CHAR_ARRAY: char * array declaration might be better as static const
#2173: FILE: drivers/gpu/drm/i915/i915_reset.c:1099:
+	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };

-:2174: WARNING:STATIC_CONST_CHAR_ARRAY: char * array declaration might be better as static const
#2174: FILE: drivers/gpu/drm/i915/i915_reset.c:1100:
+	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };

-:2352: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#2352: FILE: drivers/gpu/drm/i915/i915_reset.h:1:
+/*

total: 0 errors, 8 warnings, 1 checks, 2806 lines checked
c3b127c4ee92 drm/i915: Remove GPU reset dependence on struct_mutex
-:639: WARNING:MEMORY_BARRIER: memory barrier without comment
#639: FILE: drivers/gpu/drm/i915/i915_reset.c:563:
+	smp_store_mb(i915->gpu_error.restart, NULL);

total: 0 errors, 1 warnings, 0 checks, 1265 lines checked


^ permalink raw reply	[flat|nested] 26+ messages in thread

* ✗ Fi.CI.SPARSE: warning for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (7 preceding siblings ...)
  2018-07-11  7:46 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex Patchwork
@ 2018-07-11  7:50 ` Patchwork
  2018-07-11  8:03 ` ✓ Fi.CI.BAT: success " Patchwork
  2018-07-11  8:59 ` ✗ Fi.CI.IGT: failure " Patchwork
  10 siblings, 0 replies; 26+ messages in thread
From: Patchwork @ 2018-07-11  7:50 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
URL   : https://patchwork.freedesktop.org/series/46289/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Commit: drm/i915: Introduce i915_address_space.mutex
+drivers/gpu/drm/i915/i915_gem_gtt.c:1001:9: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem_gtt.c:1001:9: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem_gtt.c:1001:9: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem_gtt.c:1001:9: warning: expression using sizeof(void)

Commit: drm/i915: Move fence register tracking to GGTT
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3652:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3645:16: warning: expression using sizeof(void)

Commit: drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
Okay!

Commit: drm/i915: Move fence-reg interface to i915_gem_fence_reg.h
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3645:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3630:16: warning: expression using sizeof(void)

Commit: drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg
+./include/linux/slab.h:631:13: error: not a function <noident>

Commit: drm/i915: Pull all the reset functionality together into i915_reset.c
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3630:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3601:16: warning: expression using sizeof(void)

Commit: drm/i915: Remove GPU reset dependence on struct_mutex
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3601:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3596:16: warning: expression using sizeof(void)


^ permalink raw reply	[flat|nested] 26+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (8 preceding siblings ...)
  2018-07-11  7:50 ` ✗ Fi.CI.SPARSE: " Patchwork
@ 2018-07-11  8:03 ` Patchwork
  2018-07-11  8:59 ` ✗ Fi.CI.IGT: failure " Patchwork
  10 siblings, 0 replies; 26+ messages in thread
From: Patchwork @ 2018-07-11  8:03 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
URL   : https://patchwork.freedesktop.org/series/46289/
State : success

== Summary ==

= CI Bug Log - changes from CI_DRM_4469 -> Patchwork_9607 =

== Summary - SUCCESS ==

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/46289/revisions/1/mbox/

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9607:

  === IGT changes ===

    ==== Possible regressions ====

    igt@kms_pipe_crc_basic@suspend-read-crc-pipe-a:
      {fi-skl-iommu}:     PASS -> DMESG-WARN

    
== Known issues ==

  Here are the changes found in Patchwork_9607 that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@gem_exec_suspend@basic-s4-devices:
      fi-kbl-7500u:       PASS -> DMESG-WARN (fdo#105128, fdo#107139)

    igt@kms_flip@basic-flip-vs-wf_vblank:
      fi-hsw-peppy:       PASS -> FAIL (fdo#100368)

    
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
  fdo#105128 https://bugs.freedesktop.org/show_bug.cgi?id=105128
  fdo#107139 https://bugs.freedesktop.org/show_bug.cgi?id=107139


== Participating hosts (46 -> 42) ==

  Missing    (4): fi-ctg-p8600 fi-ilk-m540 fi-bsw-cyan fi-hsw-4200u 


== Build changes ==

    * Linux: CI_DRM_4469 -> Patchwork_9607

  CI_DRM_4469: 02e578b7aace48d33fa617dddb40621bd664c92c @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4546: e8905e756cf3640c66541e963ff97f8af2d98936 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9607: c3b127c4ee925c38cbcf0dcee419cf011879d92a @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

c3b127c4ee92 drm/i915: Remove GPU reset dependence on struct_mutex
53e04e954008 drm/i915: Pull all the reset functionality together into i915_reset.c
eb0335ad83e2 drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg
c2383b61bd9b drm/i915: Move fence-reg interface to i915_gem_fence_reg.h
236a17479b0d drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
0fc3f4498130 drm/i915: Move fence register tracking to GGTT
bfb49745fd80 drm/i915: Introduce i915_address_space.mutex

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9607/issues.html

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
@ 2018-07-11  8:09   ` Daniel Vetter
  2018-07-11  9:33   ` Daniel Vetter
  1 sibling, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  8:09 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:02AM +0100, Chris Wilson wrote:
> Add a mutex into struct i915_address_space to be used while operating on
> the vma and their lists for a particular vm. As this may be called from
> the shrinker, we taint the mutex with fs_reclaim so that from the start
> lockdep warns us if we are caught holding the mutex across an
> allocation. (With such small steps we will eventually rid ourselves of
> struct_mutex recursion!)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
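
Just to spell out what the taint buys us, a minimal illustrative fragment
(not from the patch; would_be_flagged is a made-up name): the dummy acquire
teaches lockdep the fs_reclaim -> vm->mutex ordering, so any GFP_KERNEL
allocation made while holding the mutex completes the inverse ordering and
is reported immediately as a possible circular locking dependency, instead
of deadlocking against the shrinker only under memory pressure.

static int would_be_flagged(struct i915_address_space *vm)
{
	void *ptr;

	mutex_lock(&vm->mutex);
	ptr = kmalloc(SZ_4K, GFP_KERNEL);	/* may recurse into reclaim */
	mutex_unlock(&vm->mutex);

	kfree(ptr);
	return ptr ? 0 : -ENOMEM;
}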

> ---
>  drivers/gpu/drm/i915/i915_drv.h          |  2 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c      | 10 ++++++++++
>  drivers/gpu/drm/i915/i915_gem_gtt.h      |  2 ++
>  drivers/gpu/drm/i915/i915_gem_shrinker.c | 12 ++++++++++++
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index eeb002a47032..01dd29837233 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3304,7 +3304,7 @@ unsigned long i915_gem_shrink(struct drm_i915_private *i915,
>  unsigned long i915_gem_shrink_all(struct drm_i915_private *i915);
>  void i915_gem_shrinker_register(struct drm_i915_private *i915);
>  void i915_gem_shrinker_unregister(struct drm_i915_private *i915);
> -
> +void i915_gem_shrinker_taints_mutex(struct mutex *mutex);
>  
>  /* i915_gem_tiling.c */
>  static inline bool i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index abd81fb9b0b6..d0acef299b9c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -531,6 +531,14 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
>  static void i915_address_space_init(struct i915_address_space *vm,
>  				    struct drm_i915_private *dev_priv)
>  {
> +	/*
> +	 * The vm->mutex must be reclaim safe (for use in the shrinker).
> +	 * Do a dummy acquire now under fs_reclaim so that any allocation
> +	 * attempt holding the lock is immediately reported by lockdep.
> +	 */
> +	mutex_init(&vm->mutex);
> +	i915_gem_shrinker_taints_mutex(&vm->mutex);
> +
>  	GEM_BUG_ON(!vm->total);
>  	drm_mm_init(&vm->mm, 0, vm->total);
>  	vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;
> @@ -551,6 +559,8 @@ static void i915_address_space_fini(struct i915_address_space *vm)
>  	spin_unlock(&vm->free_pages.lock);
>  
>  	drm_mm_takedown(&vm->mm);
> +
> +	mutex_destroy(&vm->mutex);
>  }
>  
>  static int __setup_page_dma(struct i915_address_space *vm,
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index feda45dfd481..14e62651010b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -293,6 +293,8 @@ struct i915_address_space {
>  
>  	bool closed;
>  
> +	struct mutex mutex; /* protects vma and our lists */
> +
>  	struct i915_page_dma scratch_page;
>  	struct i915_page_table *scratch_pt;
>  	struct i915_page_directory *scratch_pd;
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index c61f5b80fee3..ea90d3a0d511 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include <linux/oom.h>
> +#include <linux/sched/mm.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  #include <linux/swap.h>
> @@ -531,3 +532,14 @@ void i915_gem_shrinker_unregister(struct drm_i915_private *i915)
>  	WARN_ON(unregister_oom_notifier(&i915->mm.oom_notifier));
>  	unregister_shrinker(&i915->mm.shrinker);
>  }
> +
> +void i915_gem_shrinker_taints_mutex(struct mutex *mutex)
> +{
> +	if (!IS_ENABLED(CONFIG_LOCKDEP))
> +		return;
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	mutex_lock(mutex);
> +	mutex_unlock(mutex);
> +	fs_reclaim_release(GFP_KERNEL);
> +}
> -- 
> 2.18.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/7] drm/i915: Move fence register tracking to GGTT
  2018-07-11  7:36 ` [PATCH 2/7] drm/i915: Move fence register tracking to GGTT Chris Wilson
@ 2018-07-11  8:19   ` Daniel Vetter
  2018-07-11  8:27     ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  8:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:03AM +0100, Chris Wilson wrote:
> As the fence registers define special regions of the mappable aperture
> inside the Global GTT, and we track those regions using GGTT VMA, it
> makes sense to pull that bookkeeping under i915_ggtt. The advantage is
> that we can then start using a local GGTT lock to handle the fence
> registers (in conjunction with the GGTT VMA) rather than struct_mutex.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/gvt/gvt.h            |  2 +-
>  drivers/gpu/drm/i915/i915_debugfs.c       | 16 ++---
>  drivers/gpu/drm/i915/i915_drv.c           |  4 +-
>  drivers/gpu/drm/i915/i915_drv.h           |  7 ---
>  drivers/gpu/drm/i915/i915_gem.c           | 33 +++++-----
>  drivers/gpu/drm/i915/i915_gem_fence_reg.c | 76 ++++++++++++-----------
>  drivers/gpu/drm/i915/i915_gem_fence_reg.h |  9 ++-
>  drivers/gpu/drm/i915/i915_gem_gtt.c       |  8 ++-
>  drivers/gpu/drm/i915/i915_gem_gtt.h       |  7 ++-
>  drivers/gpu/drm/i915/i915_gpu_error.c     |  7 ++-
>  10 files changed, 89 insertions(+), 80 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gvt/gvt.h b/drivers/gpu/drm/i915/gvt/gvt.h
> index de2a3a2580be..11609a4003ff 100644
> --- a/drivers/gpu/drm/i915/gvt/gvt.h
> +++ b/drivers/gpu/drm/i915/gvt/gvt.h
> @@ -391,7 +391,7 @@ int intel_gvt_load_firmware(struct intel_gvt *gvt);
>  #define gvt_hidden_gmadr_end(gvt) (gvt_hidden_gmadr_base(gvt) \
>  				   + gvt_hidden_sz(gvt) - 1)
>  
> -#define gvt_fence_sz(gvt) (gvt->dev_priv->num_fence_regs)
> +#define gvt_fence_sz(gvt) ((gvt)->dev_priv->ggtt.num_fence_regs)
>  
>  /* Aperture/GM space definitions for vGPU */
>  #define vgpu_aperture_offset(vgpu)	((vgpu)->gm.low_gm_node.start)
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 099f97ef2303..75ffed6a3f31 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -914,20 +914,20 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
>  
>  static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
>  {
> -	struct drm_i915_private *dev_priv = node_to_i915(m->private);
> -	struct drm_device *dev = &dev_priv->drm;
> +	struct drm_i915_private *i915 = node_to_i915(m->private);
> +	const struct i915_ggtt *ggtt = &i915->ggtt;
>  	int i, ret;
>  
> -	ret = mutex_lock_interruptible(&dev->struct_mutex);
> +	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
>  	if (ret)
>  		return ret;
>  
> -	seq_printf(m, "Total fences = %d\n", dev_priv->num_fence_regs);
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct i915_vma *vma = dev_priv->fence_regs[i].vma;
> +	seq_printf(m, "Total fences = %d\n", ggtt->num_fence_regs);
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct i915_vma *vma = ggtt->fence_regs[i].vma;
>  
>  		seq_printf(m, "Fence %d, pin count = %d, object = ",
> -			   i, dev_priv->fence_regs[i].pin_count);
> +			   i, ggtt->fence_regs[i].pin_count);
>  		if (!vma)
>  			seq_puts(m, "unused");
>  		else
> @@ -935,7 +935,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
>  		seq_putc(m, '\n');
>  	}
>  
> -	mutex_unlock(&dev->struct_mutex);
> +	mutex_unlock(&i915->drm.struct_mutex);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 3eba3d1ab5b8..97a2054c38d4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -321,7 +321,7 @@ static int i915_getparam_ioctl(struct drm_device *dev, void *data,
>  		value = pdev->revision;
>  		break;
>  	case I915_PARAM_NUM_FENCES_AVAIL:
> -		value = dev_priv->num_fence_regs;
> +		value = dev_priv->ggtt.num_fence_regs;
>  		break;
>  	case I915_PARAM_HAS_OVERLAY:
>  		value = dev_priv->overlay ? 1 : 0;
> @@ -1154,8 +1154,6 @@ static int i915_driver_init_hw(struct drm_i915_private *dev_priv)
>  
>  	intel_uncore_sanitize(dev_priv);
>  
> -	i915_gem_load_init_fences(dev_priv);

Not entirely sure whether moving this is ok, but it looks fine:
uncore_sanitize just cleans up forcewake stuff, which we don't need for
the fence regs. I'll leave the actual checking against reality to CI.
> -
>  	/* On the 945G/GM, the chipset reports the MSI capability on the
>  	 * integrated graphics even though the support isn't actually there
>  	 * according to the published specs.  It doesn't appear to function
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 01dd29837233..a7f2d747e221 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -966,9 +966,6 @@ struct i915_gem_mm {
>  	struct notifier_block vmap_notifier;
>  	struct shrinker shrinker;
>  
> -	/** LRU list of objects with fence regs on them. */
> -	struct list_head fence_list;
> -
>  	/**
>  	 * Workqueue to fault in userptr pages, flushed by the execbuf
>  	 * when required but otherwise left to userspace to try again
> @@ -1678,9 +1675,6 @@ struct drm_i915_private {
>  	/* protects panel power sequencer state */
>  	struct mutex pps_mutex;
>  
> -	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES]; /* assume 965 */
> -	int num_fence_regs; /* 8 on pre-965, 16 otherwise */
> -
>  	unsigned int fsb_freq, mem_freq, is_ddr3;
>  	unsigned int skl_preferred_vco_freq;
>  	unsigned int max_cdclk_freq;
> @@ -2886,7 +2880,6 @@ int i915_gem_wait_ioctl(struct drm_device *dev, void *data,
>  void i915_gem_sanitize(struct drm_i915_private *i915);
>  int i915_gem_init_early(struct drm_i915_private *dev_priv);
>  void i915_gem_cleanup_early(struct drm_i915_private *dev_priv);
> -void i915_gem_load_init_fences(struct drm_i915_private *dev_priv);
>  int i915_gem_freeze(struct drm_i915_private *dev_priv);
>  int i915_gem_freeze_late(struct drm_i915_private *dev_priv);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 07a92ca61dbf..356c86071ccc 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2214,8 +2214,9 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>  	intel_runtime_pm_put(i915);
>  }
>  
> -void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
> +void i915_gem_runtime_suspend(struct drm_i915_private *i915)
>  {
> +	struct i915_ggtt *ggtt = &i915->ggtt;
>  	struct drm_i915_gem_object *obj, *on;
>  	int i;
>  
> @@ -2227,15 +2228,15 @@ void i915_gem_runtime_suspend(struct drm_i915_private *dev_priv)
>  	 */
>  
>  	list_for_each_entry_safe(obj, on,
> -				 &dev_priv->mm.userfault_list, userfault_link)
> +				 &i915->mm.userfault_list, userfault_link)
>  		__i915_gem_object_release_mmap(obj);
>  
>  	/* The fence will be lost when the device powers down. If any were
>  	 * in use by hardware (i.e. they are pinned), we should not be powering
>  	 * down! All other fences will be reacquired by the user upon waking.
>  	 */
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
>  
>  		/* Ideally we want to assert that the fence register is not
>  		 * live at this point (i.e. that no piece of code will be
> @@ -5630,32 +5631,33 @@ i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
>  		dev_priv->gt.cleanup_engine(engine);
>  }
>  
> -void
> -i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
> +void i915_ggtt_init_fences(struct i915_ggtt *ggtt)

A follow-up patch to put this into i915_gem_fence_reg.c would be kinda cool,
I think. The decl is already in i915_gem_fence_reg.h anyway.

>  {
> +	struct drm_i915_private *dev_priv = ggtt->vm.i915;
>  	int i;
>  
>  	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
>  	    !IS_CHERRYVIEW(dev_priv))
> -		dev_priv->num_fence_regs = 32;
> +		ggtt->num_fence_regs = 32;
>  	else if (INTEL_GEN(dev_priv) >= 4 ||
>  		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
>  		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
> -		dev_priv->num_fence_regs = 16;
> +		ggtt->num_fence_regs = 16;
>  	else
> -		dev_priv->num_fence_regs = 8;
> +		ggtt->num_fence_regs = 8;
>  
>  	if (intel_vgpu_active(dev_priv))
> -		dev_priv->num_fence_regs =
> -				I915_READ(vgtif_reg(avail_rs.fence_num));
> +		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
> +
> +	INIT_LIST_HEAD(&ggtt->fence_list);
>  
>  	/* Initialize fence registers to zero */
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
>  
> -		fence->i915 = dev_priv;
> +		fence->ggtt = ggtt;
>  		fence->id = i;
> -		list_add_tail(&fence->link, &dev_priv->mm.fence_list);
> +		list_add_tail(&fence->link, &ggtt->fence_list);
>  	}
>  	i915_gem_restore_fences(dev_priv);
>  
> @@ -5672,7 +5674,6 @@ static void i915_gem_init__mm(struct drm_i915_private *i915)
>  
>  	INIT_LIST_HEAD(&i915->mm.unbound_list);
>  	INIT_LIST_HEAD(&i915->mm.bound_list);
> -	INIT_LIST_HEAD(&i915->mm.fence_list);
>  	INIT_LIST_HEAD(&i915->mm.userfault_list);
>  
>  	INIT_WORK(&i915->mm.free_work, __i915_gem_free_work);
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index d548ac05ccd7..60fa5a8276cb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -64,7 +64,7 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
>  	int fence_pitch_shift;
>  	u64 val;
>  
> -	if (INTEL_GEN(fence->i915) >= 6) {
> +	if (INTEL_GEN(fence->ggtt->vm.i915) >= 6) {
>  		fence_reg_lo = FENCE_REG_GEN6_LO(fence->id);
>  		fence_reg_hi = FENCE_REG_GEN6_HI(fence->id);
>  		fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT;
> @@ -93,7 +93,7 @@ static void i965_write_fence_reg(struct drm_i915_fence_reg *fence,
>  	}
>  
>  	if (!pipelined) {
> -		struct drm_i915_private *dev_priv = fence->i915;
> +		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
>  
>  		/* To w/a incoherency with non-atomic 64-bit register updates,
>  		 * we split the 64-bit update into two 32-bit writes. In order
> @@ -129,7 +129,7 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
>  		GEM_BUG_ON(!is_power_of_2(vma->fence_size));
>  		GEM_BUG_ON(!IS_ALIGNED(vma->node.start, vma->fence_size));
>  
> -		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->i915))
> +		if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence->ggtt->vm.i915))
>  			stride /= 128;
>  		else
>  			stride /= 512;
> @@ -145,7 +145,7 @@ static void i915_write_fence_reg(struct drm_i915_fence_reg *fence,
>  	}
>  
>  	if (!pipelined) {
> -		struct drm_i915_private *dev_priv = fence->i915;
> +		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
>  		i915_reg_t reg = FENCE_REG(fence->id);
>  
>  		I915_WRITE(reg, val);
> @@ -177,7 +177,7 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
>  	}
>  
>  	if (!pipelined) {
> -		struct drm_i915_private *dev_priv = fence->i915;
> +		struct drm_i915_private *dev_priv = fence->ggtt->vm.i915;
>  		i915_reg_t reg = FENCE_REG(fence->id);
>  
>  		I915_WRITE(reg, val);
> @@ -193,9 +193,9 @@ static void fence_write(struct drm_i915_fence_reg *fence,
>  	 * and explicitly managed for internal users.
>  	 */
>  
> -	if (IS_GEN2(fence->i915))
> +	if (IS_GEN2(fence->ggtt->vm.i915))
>  		i830_write_fence_reg(fence, vma);
> -	else if (IS_GEN3(fence->i915))
> +	else if (IS_GEN3(fence->ggtt->vm.i915))
>  		i915_write_fence_reg(fence, vma);
>  	else
>  		i965_write_fence_reg(fence, vma);
> @@ -210,6 +210,7 @@ static void fence_write(struct drm_i915_fence_reg *fence,
>  static int fence_update(struct drm_i915_fence_reg *fence,
>  			struct i915_vma *vma)
>  {
> +	struct i915_ggtt *ggtt = fence->ggtt;
>  	int ret;
>  
>  	if (vma) {
> @@ -250,16 +251,16 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  		fence->vma->fence = NULL;
>  		fence->vma = NULL;
>  
> -		list_move(&fence->link, &fence->i915->mm.fence_list);
> +		list_move(&fence->link, &ggtt->fence_list);
>  	}
>  
>  	/* We only need to update the register itself if the device is awake.
>  	 * If the device is currently powered down, we will defer the write
>  	 * to the runtime resume, see i915_gem_restore_fences().
>  	 */
> -	if (intel_runtime_pm_get_if_in_use(fence->i915)) {
> +	if (intel_runtime_pm_get_if_in_use(ggtt->vm.i915)) {
>  		fence_write(fence, vma);
> -		intel_runtime_pm_put(fence->i915);
> +		intel_runtime_pm_put(ggtt->vm.i915);
>  	}
>  
>  	if (vma) {
> @@ -268,7 +269,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  			fence->vma = vma;
>  		}
>  
> -		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
> +		list_move_tail(&fence->link, &ggtt->fence_list);
>  	}
>  
>  	return 0;
> @@ -298,11 +299,11 @@ int i915_vma_put_fence(struct i915_vma *vma)
>  	return fence_update(fence, NULL);
>  }
>  
> -static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
> +static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
>  {
>  	struct drm_i915_fence_reg *fence;
>  
> -	list_for_each_entry(fence, &dev_priv->mm.fence_list, link) {
> +	list_for_each_entry(fence, &ggtt->fence_list, link) {
>  		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
>  
>  		if (fence->pin_count)
> @@ -312,7 +313,7 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
>  	}
>  
>  	/* Wait for completion of pending flips which consume fences */
> -	if (intel_has_pending_fb_unpin(dev_priv))
> +	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
>  		return ERR_PTR(-EAGAIN);
>  
>  	return ERR_PTR(-EDEADLK);
> @@ -339,14 +340,15 @@ static struct drm_i915_fence_reg *fence_find(struct drm_i915_private *dev_priv)
>  int
>  i915_vma_pin_fence(struct i915_vma *vma)
>  {
> -	struct drm_i915_fence_reg *fence;
> +	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
>  	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
> +	struct drm_i915_fence_reg *fence;
>  	int err;
>  
>  	/* Note that we revoke fences on runtime suspend. Therefore the user
>  	 * must keep the device awake whilst using the fence.
>  	 */
> -	assert_rpm_wakelock_held(vma->vm->i915);
> +	assert_rpm_wakelock_held(ggtt->vm.i915);
>  
>  	/* Just update our place in the LRU if our fence is getting reused. */
>  	if (vma->fence) {
> @@ -354,12 +356,11 @@ i915_vma_pin_fence(struct i915_vma *vma)
>  		GEM_BUG_ON(fence->vma != vma);
>  		fence->pin_count++;
>  		if (!fence->dirty) {
> -			list_move_tail(&fence->link,
> -				       &fence->i915->mm.fence_list);
> +			list_move_tail(&fence->link, &ggtt->fence_list);
>  			return 0;
>  		}
>  	} else if (set) {
> -		fence = fence_find(vma->vm->i915);
> +		fence = fence_find(ggtt);
>  		if (IS_ERR(fence))
>  			return PTR_ERR(fence);
>  
> @@ -385,28 +386,29 @@ i915_vma_pin_fence(struct i915_vma *vma)
>  
>  /**
>   * i915_reserve_fence - Reserve a fence for vGPU
> - * @dev_priv: i915 device private
> + * @i915: i915 device private
>   *
>   * This function walks the fence regs looking for a free one and remove
>   * it from the fence_list. It is used to reserve fence for vGPU to use.
>   */
>  struct drm_i915_fence_reg *
> -i915_reserve_fence(struct drm_i915_private *dev_priv)
> +i915_reserve_fence(struct drm_i915_private *i915)
>  {
> +	struct i915_ggtt *ggtt = &i915->ggtt;
>  	struct drm_i915_fence_reg *fence;
>  	int count;
>  	int ret;
>  
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +	lockdep_assert_held(&i915->drm.struct_mutex);
>  
>  	/* Keep at least one fence available for the display engine. */
>  	count = 0;
> -	list_for_each_entry(fence, &dev_priv->mm.fence_list, link)
> +	list_for_each_entry(fence, &ggtt->fence_list, link)
>  		count += !fence->pin_count;
>  	if (count <= 1)
>  		return ERR_PTR(-ENOSPC);
>  
> -	fence = fence_find(dev_priv);
> +	fence = fence_find(ggtt);
>  	if (IS_ERR(fence))
>  		return fence;
>  
> @@ -429,14 +431,14 @@ i915_reserve_fence(struct drm_i915_private *dev_priv)
>   */
>  void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
>  {
> -	lockdep_assert_held(&fence->i915->drm.struct_mutex);
> +	lockdep_assert_held(&fence->ggtt->vm.i915->drm.struct_mutex);
>  
> -	list_add(&fence->link, &fence->i915->mm.fence_list);
> +	list_add(&fence->link, &fence->ggtt->fence_list);
>  }
>  
>  /**
>   * i915_gem_revoke_fences - revoke fence state
> - * @dev_priv: i915 device private
> + * @i915: i915 device private
>   *
>   * Removes all GTT mmappings via the fence registers. This forces any user
>   * of the fence to reacquire that fence before continuing with their access.
> @@ -444,14 +446,15 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
>   * revoke concurrent userspace access via GTT mmaps until the hardware has been
>   * reset and the fence registers have been restored.
>   */
> -void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
> +void i915_gem_revoke_fences(struct drm_i915_private *i915)
>  {
> +	struct i915_ggtt *ggtt = &i915->ggtt;
>  	int i;
>  
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> +	lockdep_assert_held(&i915->drm.struct_mutex);
>  
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
>  
>  		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
>  
> @@ -462,18 +465,19 @@ void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
>  
>  /**
>   * i915_gem_restore_fences - restore fence state
> - * @dev_priv: i915 device private
> + * @i915: i915 device private
>   *
>   * Restore the hw fence state to match the software tracking again, to be called
>   * after a gpu reset and on resume. Note that on runtime suspend we only cancel
>   * the fences, to be reacquired by the user later.
>   */
> -void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
> +void i915_gem_restore_fences(struct drm_i915_private *i915)
>  {
> +	struct i915_ggtt *ggtt = &i915->ggtt;
>  	int i;
>  
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
>  		struct i915_vma *vma = reg->vma;
>  
>  		GEM_BUG_ON(vma && vma->fence != reg);
> @@ -486,7 +490,7 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>  			GEM_BUG_ON(!reg->dirty);
>  			GEM_BUG_ON(i915_vma_has_userfault(vma));
>  
> -			list_move(&reg->link, &dev_priv->mm.fence_list);
> +			list_move(&reg->link, &ggtt->fence_list);
>  			vma->fence = NULL;
>  			vma = NULL;
>  		}
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> index 99a31ded4dfd..c8f1d0cdfa90 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> @@ -28,16 +28,20 @@
>  #include <linux/list.h>
>  
>  struct drm_i915_private;
> +struct i915_ggtt;
>  struct i915_vma;
>  
>  #define I965_FENCE_PAGE 4096UL
>  
>  struct drm_i915_fence_reg {
>  	struct list_head link;
> -	struct drm_i915_private *i915;
> +
> +	struct i915_ggtt *ggtt;
>  	struct i915_vma *vma;
> +
>  	int pin_count;
>  	int id;
> +
>  	/**
>  	 * Whether the tiling parameters for the currently
>  	 * associated fence register have changed. Note that
> @@ -49,5 +53,6 @@ struct drm_i915_fence_reg {
>  	bool dirty;
>  };
>  
> -#endif
> +void i915_ggtt_init_fences(struct i915_ggtt *ggtt);
>  
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index d0acef299b9c..abf41f90a925 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -3595,9 +3595,11 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
>  		ggtt->vm.mm.color_adjust = i915_gtt_color_adjust;
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
> -	if (!io_mapping_init_wc(&dev_priv->ggtt.iomap,
> -				dev_priv->ggtt.gmadr.start,
> -				dev_priv->ggtt.mappable_end)) {
> +	i915_ggtt_init_fences(ggtt);
> +
> +	if (!io_mapping_init_wc(&ggtt->iomap,
> +				ggtt->gmadr.start,
> +				ggtt->mappable_end)) {
>  		ret = -EIO;
>  		goto out_gtt_cleanup;
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 14e62651010b..f35a85284b1a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -38,6 +38,7 @@
>  #include <linux/mm.h>
>  #include <linux/pagevec.h>
>  
> +#include "i915_gem_fence_reg.h"
>  #include "i915_request.h"
>  #include "i915_selftest.h"
>  #include "i915_timeline.h"
> @@ -57,7 +58,6 @@
>  #define I915_MAX_NUM_FENCE_BITS 6
>  
>  struct drm_i915_file_private;
> -struct drm_i915_fence_reg;
>  struct i915_vma;
>  
>  typedef u32 gen6_pte_t;
> @@ -396,6 +396,11 @@ struct i915_ggtt {
>  
>  	int mtrr;
>  
> +	/** LRU list of objects with fence regs on them. */
> +	struct list_head fence_list;
> +	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES];
> +	int num_fence_regs;
> +
>  	struct drm_mm_node error_capture;
>  };
>  
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 8c81cf3aa182..9dfe1d02f098 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1093,16 +1093,17 @@ static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
>  static void gem_record_fences(struct i915_gpu_state *error)
>  {
>  	struct drm_i915_private *dev_priv = error->i915;
> +	const struct i915_ggtt *ggtt = &error->i915->ggtt;
>  	int i;
>  
>  	if (INTEL_GEN(dev_priv) >= 6) {
> -		for (i = 0; i < dev_priv->num_fence_regs; i++)
> +		for (i = 0; i < ggtt->num_fence_regs; i++)
>  			error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
>  	} else if (INTEL_GEN(dev_priv) >= 4) {
> -		for (i = 0; i < dev_priv->num_fence_regs; i++)
> +		for (i = 0; i < ggtt->num_fence_regs; i++)
>  			error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
>  	} else {
> -		for (i = 0; i < dev_priv->num_fence_regs; i++)
> +		for (i = 0; i < ggtt->num_fence_regs; i++)
>  			error->fence[i] = I915_READ(FENCE_REG(i));
>  	}
>  	error->nfence = i;

I didn't check whether you caught all the changes; that's what gcc is for.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> -- 
> 2.18.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 2/7] drm/i915: Move fence register tracking to GGTT
  2018-07-11  8:19   ` Daniel Vetter
@ 2018-07-11  8:27     ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  8:27 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

Quoting Daniel Vetter (2018-07-11 09:19:58)
> On Wed, Jul 11, 2018 at 08:36:03AM +0100, Chris Wilson wrote:
> > @@ -5630,32 +5631,33 @@ i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
> >               dev_priv->gt.cleanup_engine(engine);
> >  }
> >  
> > -void
> > -i915_gem_load_init_fences(struct drm_i915_private *dev_priv)
> > +void i915_ggtt_init_fences(struct i915_ggtt *ggtt)
> 
> Follow-up patch to put this into i915_gem_fence_reg.c would be kinda cool
> I think. The decl is already in i915_gem_fence_reg.h already anyway.

It's caught between two worlds; iirc I pulled it into i915_gem_gtt so
that it sits inline with ggtt init. From my pov, all we do inside this
function is set up the bookkeeping associated with tracking our usage of
fences, with a callout to the i915_gem_fence_reg.c backend to sanitize
the registers.

At the moment, I'm still falling on the side of "this touches i915->ggtt
therefore it should be alongside the rest of i915->ggtt initialisation".
But grass, it always grows greener on the other side.
-Chris

^ permalink raw reply	[flat|nested] 26+ messages in thread

* ✗ Fi.CI.IGT: failure for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
                   ` (9 preceding siblings ...)
  2018-07-11  8:03 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2018-07-11  8:59 ` Patchwork
  10 siblings, 0 replies; 26+ messages in thread
From: Patchwork @ 2018-07-11  8:59 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/7] drm/i915: Introduce i915_address_space.mutex
URL   : https://patchwork.freedesktop.org/series/46289/
State : failure

== Summary ==

= CI Bug Log - changes from CI_DRM_4469_full -> Patchwork_9607_full =

== Summary - FAILURE ==

  Serious unknown changes coming with Patchwork_9607_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_9607_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

== Possible new issues ==

  Here are the unknown changes that may have been introduced in Patchwork_9607_full:

  === IGT changes ===

    ==== Possible regressions ====

    igt@kms_universal_plane@cursor-fb-leak-pipe-c:
      shard-apl:          PASS -> FAIL

    
    ==== Warnings ====

    igt@gem_exec_schedule@deep-render:
      shard-kbl:          SKIP -> PASS

    igt@kms_atomic_interruptible@universal-setplane-cursor:
      shard-snb:          SKIP -> PASS +1

    igt@pm_rc6_residency@rc6-accuracy:
      shard-kbl:          PASS -> SKIP

    
== Known issues ==

  Here are the changes found in Patchwork_9607_full that come from known issues:

  === IGT changes ===

    ==== Issues hit ====

    igt@gem_exec_big:
      shard-hsw:          PASS -> INCOMPLETE (fdo#103540)

    igt@kms_cursor_legacy@2x-nonblocking-modeset-vs-cursor-atomic:
      shard-glk:          PASS -> FAIL (fdo#105454, fdo#106509)

    igt@kms_cursor_legacy@cursor-vs-flip-toggle:
      shard-hsw:          PASS -> FAIL (fdo#103355)

    igt@kms_fbcon_fbt@fbc-suspend:
      shard-snb:          PASS -> DMESG-WARN (fdo#102365)

    igt@kms_flip@2x-plain-flip-fb-recreate-interruptible:
      shard-glk:          PASS -> FAIL (fdo#100368) +2

    igt@kms_flip_tiling@flip-x-tiled:
      shard-glk:          PASS -> FAIL (fdo#107161, fdo#103822)

    
    ==== Possible fixes ====

    igt@drv_suspend@shrink:
      shard-apl:          INCOMPLETE (fdo#103927) -> PASS
      shard-glk:          FAIL (fdo#106886) -> PASS

    igt@gem_ctx_isolation@rcs0-s3:
      shard-kbl:          INCOMPLETE (fdo#103665) -> PASS

    igt@kms_flip@flip-vs-expired-vblank-interruptible:
      shard-glk:          FAIL (fdo#102887) -> PASS

    igt@kms_setmode@basic:
      shard-apl:          FAIL (fdo#99912) -> PASS

    
  fdo#100368 https://bugs.freedesktop.org/show_bug.cgi?id=100368
  fdo#102365 https://bugs.freedesktop.org/show_bug.cgi?id=102365
  fdo#102887 https://bugs.freedesktop.org/show_bug.cgi?id=102887
  fdo#103355 https://bugs.freedesktop.org/show_bug.cgi?id=103355
  fdo#103540 https://bugs.freedesktop.org/show_bug.cgi?id=103540
  fdo#103665 https://bugs.freedesktop.org/show_bug.cgi?id=103665
  fdo#103822 https://bugs.freedesktop.org/show_bug.cgi?id=103822
  fdo#103927 https://bugs.freedesktop.org/show_bug.cgi?id=103927
  fdo#105454 https://bugs.freedesktop.org/show_bug.cgi?id=105454
  fdo#106509 https://bugs.freedesktop.org/show_bug.cgi?id=106509
  fdo#106886 https://bugs.freedesktop.org/show_bug.cgi?id=106886
  fdo#107161 https://bugs.freedesktop.org/show_bug.cgi?id=107161
  fdo#99912 https://bugs.freedesktop.org/show_bug.cgi?id=99912


== Participating hosts (5 -> 5) ==

  No changes in participating hosts


== Build changes ==

    * Linux: CI_DRM_4469 -> Patchwork_9607

  CI_DRM_4469: 02e578b7aace48d33fa617dddb40621bd664c92c @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4546: e8905e756cf3640c66541e963ff97f8af2d98936 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_9607: c3b127c4ee925c38cbcf0dcee419cf011879d92a @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_9607/shards.html
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
  2018-07-11  7:36 ` [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex Chris Wilson
@ 2018-07-11  9:08   ` Daniel Vetter
  2018-07-11 10:57     ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  9:08 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:04AM +0100, Chris Wilson wrote:
> Introduce a new mutex to guard all of the vma operations within a vm (as
> opposed to the BKL struct_mutex) and start by using it to guard the
> fence operations for a GGTT VMA.

Commit message is a bit confusing, since you've already introduced this new
mutex in an earlier patch. Please change it to "Switch from dev->struct_mutex
to ggtt.vm->mutex" or similar ...

For the reviewer's benefit it would also be good to explain how this new
vm.mutex nests with other stuff (dev->struct_mutex and the rpm wakeref come
to mind, looking at the patch). Probably best to do that here rather than in
the docs, since docs are likely to get outdated.
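
E.g. a short comment next to the vm->mutex declaration; the ordering below
is just my guess from reading the patch, so please correct it:

/*
 * Lock ordering (outermost first):
 *   dev->struct_mutex   (the old BKL, on its way out)
 *     vm->mutex         (vma lists, GGTT fence registers)
 * The rpm wakeref is taken before vm->mutex where both are needed
 * (e.g. i915_gem_release_mmap).
 */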

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |  9 ++-
>  drivers/gpu/drm/i915/i915_gem.c            | 11 +++-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  5 +-
>  drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 68 +++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_vma.c            | 12 ++--
>  drivers/gpu/drm/i915/i915_vma.h            | 23 +++++++-
>  6 files changed, 96 insertions(+), 32 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 75ffed6a3f31..e2ba298a5d88 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -80,7 +80,7 @@ static char get_tiling_flag(struct drm_i915_gem_object *obj)
>  
>  static char get_global_flag(struct drm_i915_gem_object *obj)
>  {
> -	return obj->userfault_count ? 'g' : ' ';
> +	return READ_ONCE(obj->userfault_count) ? 'g' : ' ';

The userfault_count changes here (and below) look like misplaced hunks?

>  }
>  
>  static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
> @@ -914,11 +914,10 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
>  
>  static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
>  {
> -	struct drm_i915_private *i915 = node_to_i915(m->private);
> -	const struct i915_ggtt *ggtt = &i915->ggtt;
> +	struct i915_ggtt *ggtt = &node_to_i915(m->private)->ggtt;
>  	int i, ret;
>  
> -	ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
> +	ret = mutex_lock_interruptible(&ggtt->vm.mutex);
>  	if (ret)
>  		return ret;
>  
> @@ -935,7 +934,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
>  		seq_putc(m, '\n');
>  	}
>  
> -	mutex_unlock(&i915->drm.struct_mutex);
> +	mutex_unlock(&ggtt->vm.mutex);
>  	return 0;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 356c86071ccc..cbcba613b175 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2193,8 +2193,8 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>  	 * requirement that operations to the GGTT be made holding the RPM
>  	 * wakeref.
>  	 */
> -	lockdep_assert_held(&i915->drm.struct_mutex);
>  	intel_runtime_pm_get(i915);
> +	mutex_lock(&i915->ggtt.vm.mutex);
>  
>  	if (!obj->userfault_count)
>  		goto out;
> @@ -2211,6 +2211,7 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
>  	wmb();
>  
>  out:
> +	mutex_unlock(&i915->ggtt.vm.mutex);
>  	intel_runtime_pm_put(i915);
>  }
>  
> @@ -2223,10 +2224,12 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
>  	/*
>  	 * Only called during RPM suspend. All users of the userfault_list
>  	 * must be holding an RPM wakeref to ensure that this can not
> -	 * run concurrently with themselves (and use the struct_mutex for
> +	 * run concurrently with themselves (and use the ggtt->mutex for
>  	 * protection between themselves).
>  	 */

I think the above change isn't correct, at least not at this stage: all
users of the userfault_list still use dev->struct_mutex, not vm.mutex. I
guess we could move that over to ggtt.vm.mutex eventually, but this patch
doesn't do that.
>  
> +	mutex_lock(&i915->ggtt.vm.mutex);

I also don't think you need to take the lock just yet. For rpm we keep the
fences around (we restore them in rpm_resume after all), we just nuke the
mmap ptes.

> +
>  	list_for_each_entry_safe(obj, on,
>  				 &i915->mm.userfault_list, userfault_link)
>  		__i915_gem_object_release_mmap(obj);
> @@ -2255,6 +2258,8 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
>  		GEM_BUG_ON(i915_vma_has_userfault(reg->vma));
>  		reg->dirty = true;
>  	}
> +
> +	mutex_unlock(&i915->ggtt.vm.mutex);

>  }
>  
>  static int i915_gem_object_create_mmap_offset(struct drm_i915_gem_object *obj)
> @@ -4861,7 +4866,7 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
>  		mutex_unlock(&i915->drm.struct_mutex);
>  
>  		GEM_BUG_ON(obj->bind_count);
> -		GEM_BUG_ON(obj->userfault_count);
> +		GEM_BUG_ON(READ_ONCE(obj->userfault_count));

Another misplaced hunk?

>  		GEM_BUG_ON(atomic_read(&obj->frontbuffer_bits));
>  		GEM_BUG_ON(!list_empty(&obj->lut_list));
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 3f0c612d42e7..e1d65b165bf1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -426,8 +426,11 @@ static inline void __eb_unreserve_vma(struct i915_vma *vma, unsigned int flags)
>  {
>  	GEM_BUG_ON(!(flags & __EXEC_OBJECT_HAS_PIN));
>  
> -	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE))
> +	if (unlikely(flags & __EXEC_OBJECT_HAS_FENCE)) {
> +		mutex_lock(&vma->vm->mutex);
>  		__i915_vma_unpin_fence(vma);
> +		mutex_unlock(&vma->vm->mutex);
> +	}
>  
>  	__i915_vma_unpin(vma);
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index 60fa5a8276cb..9313a8e675c8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -188,6 +188,8 @@ static void i830_write_fence_reg(struct drm_i915_fence_reg *fence,
>  static void fence_write(struct drm_i915_fence_reg *fence,
>  			struct i915_vma *vma)
>  {
> +	lockdep_assert_held(&fence->ggtt->vm.mutex);
> +
>  	/* Previous access through the fence register is marshalled by
>  	 * the mb() inside the fault handlers (i915_gem_release_mmaps)
>  	 * and explicitly managed for internal users.
> @@ -213,6 +215,8 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  	struct i915_ggtt *ggtt = fence->ggtt;
>  	int ret;
>  
> +	lockdep_assert_held(&ggtt->vm.mutex);
> +
>  	if (vma) {
>  		if (!i915_vma_is_map_and_fenceable(vma))
>  			return -EINVAL;
> @@ -289,14 +293,39 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  int i915_vma_put_fence(struct i915_vma *vma)
>  {
>  	struct drm_i915_fence_reg *fence = vma->fence;
> +	int err;
>  
>  	if (!fence)
>  		return 0;
>  
> -	if (fence->pin_count)
> -		return -EBUSY;
> +	mutex_lock(&vma->vm->mutex);
> +	if (!fence->pin_count)
> +		err = fence_update(fence, NULL);
> +	else
> +		err = -EBUSY;
> +	mutex_unlock(&vma->vm->mutex);
>  
> -	return fence_update(fence, NULL);
> +	return err;
> +}
> +
> +void i915_vma_revoke_fence(struct i915_vma *vma)
> +{
> +	struct drm_i915_fence_reg *fence;
> +
> +	GEM_BUG_ON(!i915_vma_is_ggtt(vma));
> +	lockdep_assert_held(&vma->vm->mutex);
> +
> +	fence = vma->fence;
> +	if (!fence)
> +		return;
> +
> +	GEM_BUG_ON(fence->pin_count);
> +
> +	list_move(&fence->link, &i915_vm_to_ggtt(vma->vm)->fence_list);
> +	vma->fence = NULL;
> +
> +	fence_write(fence, NULL);
> +	fence->vma = NULL;

Please call fence_update(fence, NULL); here instead of open-coding it. Or
am I just not following why you're doing it this way?
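
I.e. roughly this (only a sketch of what I mean; I'm ignoring the error
return here, which may well be your reason for open-coding it):

void i915_vma_revoke_fence(struct i915_vma *vma)
{
	struct drm_i915_fence_reg *fence = vma->fence;

	GEM_BUG_ON(!i915_vma_is_ggtt(vma));
	lockdep_assert_held(&vma->vm->mutex);

	if (!fence)
		return;

	GEM_BUG_ON(fence->pin_count);

	/* let fence_update() do the list/vma bookkeeping for us;
	 * error return dropped purely for the sake of the sketch */
	fence_update(fence, NULL);
}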

>  }
>  
>  static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
> @@ -337,8 +366,7 @@ static struct drm_i915_fence_reg *fence_find(struct i915_ggtt *ggtt)
>   *
>   * 0 on success, negative error code on failure.
>   */
> -int
> -i915_vma_pin_fence(struct i915_vma *vma)
> +int __i915_vma_pin_fence(struct i915_vma *vma)

I don't (yet) see a caller of this new __ version ...

>  {
>  	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
>  	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
> @@ -349,6 +377,7 @@ i915_vma_pin_fence(struct i915_vma *vma)
>  	 * must keep the device awake whilst using the fence.
>  	 */
>  	assert_rpm_wakelock_held(ggtt->vm.i915);
> +	lockdep_assert_held(&ggtt->vm.mutex);
>  
>  	/* Just update our place in the LRU if our fence is getting reused. */
>  	if (vma->fence) {
> @@ -399,27 +428,34 @@ i915_reserve_fence(struct drm_i915_private *i915)
>  	int count;
>  	int ret;
>  
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> +	mutex_lock(&i915->ggtt.vm.mutex);
>  
>  	/* Keep at least one fence available for the display engine. */
>  	count = 0;
>  	list_for_each_entry(fence, &ggtt->fence_list, link)
>  		count += !fence->pin_count;
> -	if (count <= 1)
> -		return ERR_PTR(-ENOSPC);
> +	if (count <= 1) {
> +		fence = ERR_PTR(-ENOSPC);
> +		goto out_unlock;
> +	}
>  
>  	fence = fence_find(ggtt);
>  	if (IS_ERR(fence))
> -		return fence;
> +		goto out_unlock;
>  
>  	if (fence->vma) {
>  		/* Force-remove fence from VMA */
>  		ret = fence_update(fence, NULL);
> -		if (ret)
> -			return ERR_PTR(ret);
> +		if (ret) {
> +			fence = ERR_PTR(ret);
> +			goto out_unlock;
> +		}
>  	}
>  
>  	list_del(&fence->link);
> +
> +out_unlock:
> +	mutex_unlock(&i915->ggtt.vm.mutex);
>  	return fence;
>  }
>  
> @@ -431,9 +467,9 @@ i915_reserve_fence(struct drm_i915_private *i915)
>   */
>  void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
>  {
> -	lockdep_assert_held(&fence->ggtt->vm.i915->drm.struct_mutex);
> -
> +	mutex_lock(&fence->ggtt->vm.mutex);
>  	list_add(&fence->link, &fence->ggtt->fence_list);
> +	mutex_unlock(&fence->ggtt->vm.mutex);
>  }
>  
>  /**
> @@ -451,8 +487,7 @@ void i915_gem_revoke_fences(struct drm_i915_private *i915)
>  	struct i915_ggtt *ggtt = &i915->ggtt;
>  	int i;
>  
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -
> +	mutex_lock(&ggtt->vm.mutex);
>  	for (i = 0; i < ggtt->num_fence_regs; i++) {
>  		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
>  
> @@ -461,6 +496,7 @@ void i915_gem_revoke_fences(struct drm_i915_private *i915)
>  		if (fence->vma)
>  			i915_vma_revoke_mmap(fence->vma);
>  	}
> +	mutex_unlock(&ggtt->vm.mutex);
>  }
>  
>  /**
> @@ -476,6 +512,7 @@ void i915_gem_restore_fences(struct drm_i915_private *i915)
>  	struct i915_ggtt *ggtt = &i915->ggtt;
>  	int i;
>  
> +	mutex_lock(&ggtt->vm.mutex);
>  	for (i = 0; i < ggtt->num_fence_regs; i++) {
>  		struct drm_i915_fence_reg *reg = &ggtt->fence_regs[i];
>  		struct i915_vma *vma = reg->vma;
> @@ -498,6 +535,7 @@ void i915_gem_restore_fences(struct drm_i915_private *i915)
>  		fence_write(reg, vma);
>  		reg->vma = vma;
>  	}
> +	mutex_unlock(&ggtt->vm.mutex);
>  }
>  
>  /**
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index ed4e0fb558f7..045b75d79f60 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -860,7 +860,7 @@ void i915_vma_revoke_mmap(struct i915_vma *vma)
>  	struct drm_vma_offset_node *node = &vma->obj->base.vma_node;
>  	u64 vma_offset;
>  
> -	lockdep_assert_held(&vma->vm->i915->drm.struct_mutex);
> +	lockdep_assert_held(&vma->vm->mutex);
>  
>  	if (!i915_vma_has_userfault(vma))
>  		return;
> @@ -1082,6 +1082,8 @@ int i915_vma_unbind(struct i915_vma *vma)
>  		return 0;
>  
>  	if (i915_vma_is_map_and_fenceable(vma)) {
> +		mutex_lock(&vma->vm->mutex);
> +
>  		/*
>  		 * Check that we have flushed all writes through the GGTT
>  		 * before the unbind, other due to non-strict nature of those
> @@ -1091,16 +1093,14 @@ int i915_vma_unbind(struct i915_vma *vma)
>  		i915_vma_flush_writes(vma);
>  		GEM_BUG_ON(i915_vma_has_ggtt_write(vma));
>  
> -		/* release the fence reg _after_ flushing */
> -		ret = i915_vma_put_fence(vma);
> -		if (ret)
> -			return ret;
> -
>  		/* Force a pagefault for domain tracking on next user access */
>  		i915_vma_revoke_mmap(vma);
> +		i915_vma_revoke_fence(vma);
>  
>  		__i915_vma_iounmap(vma);
>  		vma->flags &= ~I915_VMA_CAN_FENCE;
> +
> +		mutex_unlock(&vma->vm->mutex);
>  	}
>  	GEM_BUG_ON(vma->fence);
>  	GEM_BUG_ON(i915_vma_has_userfault(vma));
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index f06d66377107..422d90c686b5 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -190,6 +190,7 @@ static inline bool i915_vma_set_userfault(struct i915_vma *vma)
>  
>  static inline void i915_vma_unset_userfault(struct i915_vma *vma)
>  {
> +	lockdep_assert_held(&vma->vm->mutex);
>  	return __clear_bit(I915_VMA_USERFAULT_BIT, &vma->flags);
>  }
>  
> @@ -378,11 +379,26 @@ static inline struct page *i915_vma_first_page(struct i915_vma *vma)
>   *
>   * True if the vma has a fence, false otherwise.
>   */
> -int i915_vma_pin_fence(struct i915_vma *vma);
> +int __i915_vma_pin_fence(struct i915_vma *vma);
> +static inline int i915_vma_pin_fence(struct i915_vma *vma)
> +{
> +	int err;
> +
> +	mutex_lock(&vma->vm->mutex);
> +	err = __i915_vma_pin_fence(vma);
> +	mutex_unlock(&vma->vm->mutex);
> +
> +	return err;
> +}
> +
>  int __must_check i915_vma_put_fence(struct i915_vma *vma);
> +void i915_vma_revoke_fence(struct i915_vma *vma);
>  
>  static inline void __i915_vma_unpin_fence(struct i915_vma *vma)
>  {
> +	lockdep_assert_held(&vma->vm->mutex);
> +	GEM_BUG_ON(!i915_vma_is_ggtt(vma));
> +
>  	GEM_BUG_ON(vma->fence->pin_count <= 0);
>  	vma->fence->pin_count--;
>  }
> @@ -399,8 +415,11 @@ static inline void
>  i915_vma_unpin_fence(struct i915_vma *vma)
>  {
>  	/* lockdep_assert_held(&vma->vm->i915->drm.struct_mutex); */
> -	if (vma->fence)
> +	if (vma->fence) {

I think you need to take the lock here before checking vma->fence. At
least I understood the fancy new locking rules to mean that vma->fence is
protected by vma->vm.mutex. Since vmas can't move between vms this is
safe.

Would be good to update the kerneldoc/comments for that accordingly too.
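
I.e. something like this (untested sketch):

static inline void
i915_vma_unpin_fence(struct i915_vma *vma)
{
	mutex_lock(&vma->vm->mutex);
	if (vma->fence)
		__i915_vma_unpin_fence(vma);
	mutex_unlock(&vma->vm->mutex);
}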

> +		mutex_lock(&vma->vm->mutex);
>  		__i915_vma_unpin_fence(vma);
> +		mutex_unlock(&vma->vm->mutex);
> +	}
>  }
>  
>  void i915_vma_parked(struct drm_i915_private *i915);

Ok the locking changes in i915_gem_fence_reg.c look good, but git grep
says there's more:
- describe_obj in i915_debugfs.c only wants dev->struct_mutex, but really
  wants vm.mutex on top too now. At least for ggtt.
- i915_gem_fence_regs_info in i915_debugfs.c is already fixed in this
  patch.
- I've ignored i915_gpu_error.c as usual :-)
- intel_display.c is annoying, since the way it tries to figure out
  whether it got a fence or not is racy. That one probably needs the
  __i915_vma_pin_fence version that I didn't find ...
- intel_fbc.c looks safe due to the PLANE_HAS_FENCE flags (minus the issue
  in intel_display.c)
- I found one more access to vma->fence->id in the selftests; that's
  really the same as what we do in i915_gpu_error.c. If you feel like it,
  an i915_vma_fence_id() static inline that does the correct READ_ONCE
  dance on vma->fence could be useful for the paranoid (see the sketch
  after this list).
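
Rough sketch of what I mean (the name is as suggested above, the -1
sentinel for "no fence" is just a suggestion):

static inline int i915_vma_fence_id(const struct i915_vma *vma)
{
	struct drm_i915_fence_reg *fence = READ_ONCE(vma->fence);

	return fence ? fence->id : -1;
}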

Absolutely loved the naming collision with dma_fence, and my grep fu isn't
good enough to avoid these. So probably missed some (but tried rather hard
not to).

Besides these small nits and questions looks all good.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg
  2018-07-11  7:36 ` [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg Chris Wilson
@ 2018-07-11  9:11   ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  9:11 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:06AM +0100, Chris Wilson wrote:
> If we dynamically allocate the correct sized array for the fence
> registers, we can avoid the 4x overallocation on older, typically
> smaller devices and avoid having to know the static layout in advance.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Feels a bit like micro-optimizing, but the patch looks ok. I'd put these two
into i915_gem_fence_reg.h though, since you're already moving/creating them.
Either way:

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>

> ---
>  drivers/gpu/drm/i915/i915_gem.c           | 33 ------------
>  drivers/gpu/drm/i915/i915_gem_fence_reg.h |  2 -
>  drivers/gpu/drm/i915/i915_gem_gtt.c       | 64 +++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_gem_gtt.h       |  3 +-
>  drivers/gpu/drm/i915/i915_vma.h           |  1 +
>  5 files changed, 62 insertions(+), 41 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index cbcba613b175..8eecd68f9e23 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -5636,39 +5636,6 @@ i915_gem_cleanup_engines(struct drm_i915_private *dev_priv)
>  		dev_priv->gt.cleanup_engine(engine);
>  }
>  
> -void i915_ggtt_init_fences(struct i915_ggtt *ggtt)
> -{
> -	struct drm_i915_private *dev_priv = ggtt->vm.i915;
> -	int i;
> -
> -	if (INTEL_GEN(dev_priv) >= 7 && !IS_VALLEYVIEW(dev_priv) &&
> -	    !IS_CHERRYVIEW(dev_priv))
> -		ggtt->num_fence_regs = 32;
> -	else if (INTEL_GEN(dev_priv) >= 4 ||
> -		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
> -		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
> -		ggtt->num_fence_regs = 16;
> -	else
> -		ggtt->num_fence_regs = 8;
> -
> -	if (intel_vgpu_active(dev_priv))
> -		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
> -
> -	INIT_LIST_HEAD(&ggtt->fence_list);
> -
> -	/* Initialize fence registers to zero */
> -	for (i = 0; i < ggtt->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
> -
> -		fence->ggtt = ggtt;
> -		fence->id = i;
> -		list_add_tail(&fence->link, &ggtt->fence_list);
> -	}
> -	i915_gem_restore_fences(dev_priv);
> -
> -	i915_gem_detect_bit_6_swizzle(dev_priv);
> -}
> -
>  static void i915_gem_init__mm(struct drm_i915_private *i915)
>  {
>  	spin_lock_init(&i915->mm.object_stat_lock);
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> index c510f8efc1bb..6e66f6b3f851 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
> @@ -56,8 +56,6 @@ struct drm_i915_fence_reg {
>  	bool dirty;
>  };
>  
> -void i915_ggtt_init_fences(struct i915_ggtt *ggtt);
> -
>  struct drm_i915_fence_reg *
>  i915_reserve_fence(struct drm_i915_private *i915);
>  void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index abf41f90a925..e6787c3af544 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -37,6 +37,7 @@
>  #include <drm/i915_drm.h>
>  
>  #include "i915_drv.h"
> +#include "i915_gem_fence_reg.h"
>  #include "i915_vgpu.h"
>  #include "i915_trace.h"
>  #include "intel_drv.h"
> @@ -2901,6 +2902,51 @@ void i915_gem_fini_aliasing_ppgtt(struct drm_i915_private *i915)
>  	ggtt->vm.vma_ops.unbind_vma = ggtt_unbind_vma;
>  }
>  
> +static int i915_ggtt_init_fences(struct i915_ggtt *ggtt)
> +{
> +	struct drm_i915_private *dev_priv = ggtt->vm.i915;
> +	int i;
> +
> +	if (INTEL_GEN(dev_priv) >= 7 &&
> +	    !(IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)))
> +		ggtt->num_fence_regs = 32;
> +	else if (INTEL_GEN(dev_priv) >= 4 ||
> +		 IS_I945G(dev_priv) || IS_I945GM(dev_priv) ||
> +		 IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
> +		ggtt->num_fence_regs = 16;
> +	else
> +		ggtt->num_fence_regs = 8;
> +
> +	if (intel_vgpu_active(dev_priv))
> +		ggtt->num_fence_regs = I915_READ(vgtif_reg(avail_rs.fence_num));
> +
> +	ggtt->fence_regs = kcalloc(ggtt->num_fence_regs,
> +				   sizeof(*ggtt->fence_regs),
> +				   GFP_KERNEL);
> +	if (!ggtt->fence_regs)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&ggtt->fence_list);
> +
> +	/* Initialize fence registers to zero */
> +	for (i = 0; i < ggtt->num_fence_regs; i++) {
> +		struct drm_i915_fence_reg *fence = &ggtt->fence_regs[i];
> +
> +		fence->ggtt = ggtt;
> +		fence->id = i;
> +		list_add_tail(&fence->link, &ggtt->fence_list);
> +	}
> +	i915_gem_restore_fences(dev_priv);
> +
> +	i915_gem_detect_bit_6_swizzle(dev_priv);
> +	return 0;
> +}
> +
> +static void i915_ggtt_cleanup_fences(struct i915_ggtt *ggtt)
> +{
> +	kfree(ggtt->fence_regs);
> +}
> +
>  int i915_gem_init_ggtt(struct drm_i915_private *dev_priv)
>  {
>  	/* Let GEM Manage all of the aperture.
> @@ -2990,6 +3036,8 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
>  
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
> +	i915_ggtt_cleanup_fences(ggtt);
> +
>  	arch_phys_wc_del(ggtt->mtrr);
>  	io_mapping_fini(&ggtt->iomap);
>  
> @@ -3595,13 +3643,15 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
>  		ggtt->vm.mm.color_adjust = i915_gtt_color_adjust;
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
> -	i915_ggtt_init_fences(ggtt);
> +	ret = i915_ggtt_init_fences(ggtt);
> +	if (ret)
> +		goto err_fini;
>  
>  	if (!io_mapping_init_wc(&ggtt->iomap,
>  				ggtt->gmadr.start,
>  				ggtt->mappable_end)) {
>  		ret = -EIO;
> -		goto out_gtt_cleanup;
> +		goto err_fences;
>  	}
>  
>  	ggtt->mtrr = arch_phys_wc_add(ggtt->gmadr.start, ggtt->mappable_end);
> @@ -3612,12 +3662,18 @@ int i915_ggtt_init_hw(struct drm_i915_private *dev_priv)
>  	 */
>  	ret = i915_gem_init_stolen(dev_priv);
>  	if (ret)
> -		goto out_gtt_cleanup;
> +		goto err_io;
>  
>  	return 0;
>  
> -out_gtt_cleanup:
> +err_io:
> +	arch_phys_wc_del(ggtt->mtrr);
> +	io_mapping_fini(&ggtt->iomap);
> +err_fences:
> +	i915_ggtt_cleanup_fences(ggtt);
> +err_fini:
>  	ggtt->vm.cleanup(&ggtt->vm);
> +	i915_address_space_fini(&ggtt->vm);
>  	return ret;
>  }
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index f35a85284b1a..f8c372dd6362 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -38,7 +38,6 @@
>  #include <linux/mm.h>
>  #include <linux/pagevec.h>
>  
> -#include "i915_gem_fence_reg.h"
>  #include "i915_request.h"
>  #include "i915_selftest.h"
>  #include "i915_timeline.h"
> @@ -398,7 +397,7 @@ struct i915_ggtt {
>  
>  	/** LRU list of objects with fence regs on them. */
>  	struct list_head fence_list;
> -	struct drm_i915_fence_reg fence_regs[I915_MAX_NUM_FENCES];
> +	struct drm_i915_fence_reg *fence_regs;
>  	int num_fence_regs;
>  
>  	struct drm_mm_node error_capture;
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 925af79cc6d6..7df156e1ca06 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -30,6 +30,7 @@
>  
>  #include <drm/drm_mm.h>
>  
> +#include "i915_gem_fence_reg.h"
>  #include "i915_gem_gtt.h"
>  #include "i915_gem_object.h"
>  
> -- 
> 2.18.0
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c
  2018-07-11  7:36 ` [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
@ 2018-07-11  9:17   ` Daniel Vetter
  2018-07-11  9:28     ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  9:17 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:07AM +0100, Chris Wilson wrote:
> Currently the code to reset the GPU and our state is spread widely
> across a few files. Pull the logic together into a common file.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Hm, when reading the fence code I'd suggested instead moving the reset/init
code for a given topic (like fence_reg) into one place. Imo that makes
reviewing changes to a specific thing easier, because it's all in one file.

Ofc it makes it harder to review large-scale flows like suspend/resume and
gpu reset, but I think the only thing that can save us there is lots and
lots of runtime consistency checks for each component (lockdep,
GEM_BUG_ON). Reviewing those is doable, and for the big picture
CI+regrets.

tldr; not sure this is the best organization we can do.
-Daniel

> ---
>  drivers/gpu/drm/i915/Makefile                 |    3 +-
>  drivers/gpu/drm/i915/i915_debugfs.c           |    2 +
>  drivers/gpu/drm/i915/i915_drv.c               |  207 +--
>  drivers/gpu/drm/i915/i915_drv.h               |   31 +-
>  drivers/gpu/drm/i915/i915_gem.c               |  465 +-----
>  drivers/gpu/drm/i915/i915_irq.c               |  220 ---
>  drivers/gpu/drm/i915/i915_request.c           |    1 +
>  drivers/gpu/drm/i915/i915_reset.c             | 1271 +++++++++++++++++
>  drivers/gpu/drm/i915/i915_reset.h             |   37 +
>  drivers/gpu/drm/i915/intel_display.c          |   15 +-
>  drivers/gpu/drm/i915/intel_guc.h              |    3 +
>  drivers/gpu/drm/i915/intel_hangcheck.c        |    1 +
>  drivers/gpu/drm/i915/intel_uc.c               |    1 +
>  drivers/gpu/drm/i915/intel_uncore.c           |  415 ------
>  .../drm/i915/selftests/intel_workarounds.c    |    1 +
>  15 files changed, 1342 insertions(+), 1331 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_reset.c
>  create mode 100644 drivers/gpu/drm/i915/i915_reset.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 5794f102f9b8..d09799e79893 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -40,7 +40,8 @@ i915-y := i915_drv.o \
>  	  i915_mm.o \
>  	  i915_params.o \
>  	  i915_pci.o \
> -          i915_suspend.o \
> +	  i915_reset.o \
> +	  i915_suspend.o \
>  	  i915_syncmap.o \
>  	  i915_sw_fence.o \
>  	  i915_sysfs.o \
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index e2ba298a5d88..a0f519c44410 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -32,6 +32,8 @@
>  #include "intel_drv.h"
>  #include "intel_guc_submission.h"
>  
> +#include "i915_reset.h"
> +
>  static inline struct drm_i915_private *node_to_i915(struct drm_info_node *node)
>  {
>  	return to_i915(node->minor->dev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 97a2054c38d4..fa3b4144a7fa 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -49,6 +49,7 @@
>  #include "i915_drv.h"
>  #include "i915_trace.h"
>  #include "i915_pmu.h"
> +#include "i915_reset.h"
>  #include "i915_query.h"
>  #include "i915_vgpu.h"
>  #include "intel_drv.h"
> @@ -1878,212 +1879,6 @@ static int i915_resume_switcheroo(struct drm_device *dev)
>  	return i915_drm_resume(dev);
>  }
>  
> -/**
> - * i915_reset - reset chip after a hang
> - * @i915: #drm_i915_private to reset
> - * @stalled_mask: mask of the stalled engines with the guilty requests
> - * @reason: user error message for why we are resetting
> - *
> - * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
> - * on failure.
> - *
> - * Caller must hold the struct_mutex.
> - *
> - * Procedure is fairly simple:
> - *   - reset the chip using the reset reg
> - *   - re-init context state
> - *   - re-init hardware status page
> - *   - re-init ring buffer
> - *   - re-init interrupt state
> - *   - re-init display
> - */
> -void i915_reset(struct drm_i915_private *i915,
> -		unsigned int stalled_mask,
> -		const char *reason)
> -{
> -	struct i915_gpu_error *error = &i915->gpu_error;
> -	int ret;
> -	int i;
> -
> -	GEM_TRACE("flags=%lx\n", error->flags);
> -
> -	might_sleep();
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> -
> -	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
> -		return;
> -
> -	/* Clear any previous failed attempts at recovery. Time to try again. */
> -	if (!i915_gem_unset_wedged(i915))
> -		goto wakeup;
> -
> -	if (reason)
> -		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> -	error->reset_count++;
> -
> -	disable_irq(i915->drm.irq);
> -	ret = i915_gem_reset_prepare(i915);
> -	if (ret) {
> -		dev_err(i915->drm.dev, "GPU recovery failed\n");
> -		goto taint;
> -	}
> -
> -	if (!intel_has_gpu_reset(i915)) {
> -		if (i915_modparams.reset)
> -			dev_err(i915->drm.dev, "GPU reset not supported\n");
> -		else
> -			DRM_DEBUG_DRIVER("GPU reset disabled\n");
> -		goto error;
> -	}
> -
> -	for (i = 0; i < 3; i++) {
> -		ret = intel_gpu_reset(i915, ALL_ENGINES);
> -		if (ret == 0)
> -			break;
> -
> -		msleep(100);
> -	}
> -	if (ret) {
> -		dev_err(i915->drm.dev, "Failed to reset chip\n");
> -		goto taint;
> -	}
> -
> -	/* Ok, now get things going again... */
> -
> -	/*
> -	 * Everything depends on having the GTT running, so we need to start
> -	 * there.
> -	 */
> -	ret = i915_ggtt_enable_hw(i915);
> -	if (ret) {
> -		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
> -			  ret);
> -		goto error;
> -	}
> -
> -	i915_gem_reset(i915, stalled_mask);
> -	intel_overlay_reset(i915);
> -
> -	/*
> -	 * Next we need to restore the context, but we don't use those
> -	 * yet either...
> -	 *
> -	 * Ring buffer needs to be re-initialized in the KMS case, or if X
> -	 * was running at the time of the reset (i.e. we weren't VT
> -	 * switched away).
> -	 */
> -	ret = i915_gem_init_hw(i915);
> -	if (ret) {
> -		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
> -			  ret);
> -		goto error;
> -	}
> -
> -	i915_queue_hangcheck(i915);
> -
> -finish:
> -	i915_gem_reset_finish(i915);
> -	enable_irq(i915->drm.irq);
> -
> -wakeup:
> -	clear_bit(I915_RESET_HANDOFF, &error->flags);
> -	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
> -	return;
> -
> -taint:
> -	/*
> -	 * History tells us that if we cannot reset the GPU now, we
> -	 * never will. This then impacts everything that is run
> -	 * subsequently. On failing the reset, we mark the driver
> -	 * as wedged, preventing further execution on the GPU.
> -	 * We also want to go one step further and add a taint to the
> -	 * kernel so that any subsequent faults can be traced back to
> -	 * this failure. This is important for CI, where if the
> -	 * GPU/driver fails we would like to reboot and restart testing
> -	 * rather than continue on into oblivion. For everyone else,
> -	 * the system should still plod along, but they have been warned!
> -	 */
> -	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> -error:
> -	i915_gem_set_wedged(i915);
> -	i915_retire_requests(i915);
> -	goto finish;
> -}
> -
> -static inline int intel_gt_reset_engine(struct drm_i915_private *dev_priv,
> -					struct intel_engine_cs *engine)
> -{
> -	return intel_gpu_reset(dev_priv, intel_engine_flag(engine));
> -}
> -
> -/**
> - * i915_reset_engine - reset GPU engine to recover from a hang
> - * @engine: engine to reset
> - * @msg: reason for GPU reset; or NULL for no dev_notice()
> - *
> - * Reset a specific GPU engine. Useful if a hang is detected.
> - * Returns zero on successful reset or otherwise an error code.
> - *
> - * Procedure is:
> - *  - identifies the request that caused the hang and it is dropped
> - *  - reset engine (which will force the engine to idle)
> - *  - re-init/configure engine
> - */
> -int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> -{
> -	struct i915_gpu_error *error = &engine->i915->gpu_error;
> -	struct i915_request *active_request;
> -	int ret;
> -
> -	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
> -	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
> -
> -	active_request = i915_gem_reset_prepare_engine(engine);
> -	if (IS_ERR_OR_NULL(active_request)) {
> -		/* Either the previous reset failed, or we pardon the reset. */
> -		ret = PTR_ERR(active_request);
> -		goto out;
> -	}
> -
> -	if (msg)
> -		dev_notice(engine->i915->drm.dev,
> -			   "Resetting %s for %s\n", engine->name, msg);
> -	error->reset_engine_count[engine->id]++;
> -
> -	if (!engine->i915->guc.execbuf_client)
> -		ret = intel_gt_reset_engine(engine->i915, engine);
> -	else
> -		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
> -	if (ret) {
> -		/* If we fail here, we expect to fallback to a global reset */
> -		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
> -				 engine->i915->guc.execbuf_client ? "GuC " : "",
> -				 engine->name, ret);
> -		goto out;
> -	}
> -
> -	/*
> -	 * The request that caused the hang is stuck on elsp, we know the
> -	 * active request and can drop it, adjust head to skip the offending
> -	 * request to resume executing remaining requests in the queue.
> -	 */
> -	i915_gem_reset_engine(engine, active_request, true);
> -
> -	/*
> -	 * The engine and its registers (and workarounds in case of render)
> -	 * have been reset to their default values. Follow the init_ring
> -	 * process to program RING_MODE, HWSP and re-enable submission.
> -	 */
> -	ret = engine->init_hw(engine);
> -	if (ret)
> -		goto out;
> -
> -out:
> -	i915_gem_reset_finish_engine(engine);
> -	return ret;
> -}
> -
>  static int i915_pm_prepare(struct device *kdev)
>  {
>  	struct pci_dev *pdev = to_pci_dev(kdev);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 43f545add21c..84b1073eacd8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2700,19 +2700,7 @@ extern const struct dev_pm_ops i915_pm_ops;
>  extern int i915_driver_load(struct pci_dev *pdev,
>  			    const struct pci_device_id *ent);
>  extern void i915_driver_unload(struct drm_device *dev);
> -extern int intel_gpu_reset(struct drm_i915_private *dev_priv, u32 engine_mask);
> -extern bool intel_has_gpu_reset(struct drm_i915_private *dev_priv);
> -
> -extern void i915_reset(struct drm_i915_private *i915,
> -		       unsigned int stalled_mask,
> -		       const char *reason);
> -extern int i915_reset_engine(struct intel_engine_cs *engine,
> -			     const char *reason);
> -
> -extern bool intel_has_reset_engine(struct drm_i915_private *dev_priv);
> -extern int intel_reset_guc(struct drm_i915_private *dev_priv);
> -extern int intel_guc_reset_engine(struct intel_guc *guc,
> -				  struct intel_engine_cs *engine);
> +
>  extern void intel_engine_init_hangcheck(struct intel_engine_cs *engine);
>  extern void intel_hangcheck_init(struct drm_i915_private *dev_priv);
>  extern unsigned long i915_chipset_val(struct drm_i915_private *dev_priv);
> @@ -2757,13 +2745,6 @@ static inline void i915_queue_hangcheck(struct drm_i915_private *dev_priv)
>  			   &dev_priv->gpu_error.hangcheck_work, delay);
>  }
>  
> -__printf(4, 5)
> -void i915_handle_error(struct drm_i915_private *dev_priv,
> -		       u32 engine_mask,
> -		       unsigned long flags,
> -		       const char *fmt, ...);
> -#define I915_ERROR_CAPTURE BIT(0)
> -
>  extern void intel_irq_init(struct drm_i915_private *dev_priv);
>  extern void intel_irq_fini(struct drm_i915_private *dev_priv);
>  int intel_irq_install(struct drm_i915_private *dev_priv);
> @@ -3126,18 +3107,8 @@ static inline u32 i915_reset_engine_count(struct i915_gpu_error *error,
>  	return READ_ONCE(error->reset_engine_count[engine->id]);
>  }
>  
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine);
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv);
> -void i915_gem_reset(struct drm_i915_private *dev_priv,
> -		    unsigned int stalled_mask);
> -void i915_gem_reset_finish_engine(struct intel_engine_cs *engine);
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv);
>  void i915_gem_set_wedged(struct drm_i915_private *dev_priv);
>  bool i915_gem_unset_wedged(struct drm_i915_private *dev_priv);
> -void i915_gem_reset_engine(struct intel_engine_cs *engine,
> -			   struct i915_request *request,
> -			   bool stalled);
>  
>  void i915_gem_init_mmio(struct drm_i915_private *i915);
>  int __must_check i915_gem_init(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 8eecd68f9e23..b5822cc36221 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -28,15 +28,6 @@
>  #include <drm/drmP.h>
>  #include <drm/drm_vma_manager.h>
>  #include <drm/i915_drm.h>
> -#include "i915_drv.h"
> -#include "i915_gem_clflush.h"
> -#include "i915_vgpu.h"
> -#include "i915_trace.h"
> -#include "intel_drv.h"
> -#include "intel_frontbuffer.h"
> -#include "intel_mocs.h"
> -#include "intel_workarounds.h"
> -#include "i915_gemfs.h"
>  #include <linux/dma-fence-array.h>
>  #include <linux/kthread.h>
>  #include <linux/reservation.h>
> @@ -47,6 +38,18 @@
>  #include <linux/pci.h>
>  #include <linux/dma-buf.h>
>  
> +#include "i915_drv.h"
> +#include "i915_gem_clflush.h"
> +#include "i915_gemfs.h"
> +#include "i915_reset.h"
> +#include "i915_trace.h"
> +#include "i915_vgpu.h"
> +
> +#include "intel_drv.h"
> +#include "intel_frontbuffer.h"
> +#include "intel_mocs.h"
> +#include "intel_workarounds.h"
> +
>  static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>  
>  static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> @@ -2960,61 +2963,6 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
>  	return 0;
>  }
>  
> -static void i915_gem_client_mark_guilty(struct drm_i915_file_private *file_priv,
> -					const struct i915_gem_context *ctx)
> -{
> -	unsigned int score;
> -	unsigned long prev_hang;
> -
> -	if (i915_gem_context_is_banned(ctx))
> -		score = I915_CLIENT_SCORE_CONTEXT_BAN;
> -	else
> -		score = 0;
> -
> -	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
> -	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
> -		score += I915_CLIENT_SCORE_HANG_FAST;
> -
> -	if (score) {
> -		atomic_add(score, &file_priv->ban_score);
> -
> -		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
> -				 ctx->name, score,
> -				 atomic_read(&file_priv->ban_score));
> -	}
> -}
> -
> -static void i915_gem_context_mark_guilty(struct i915_gem_context *ctx)
> -{
> -	unsigned int score;
> -	bool banned, bannable;
> -
> -	atomic_inc(&ctx->guilty_count);
> -
> -	bannable = i915_gem_context_is_bannable(ctx);
> -	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> -	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> -
> -	/* Cool contexts don't accumulate client ban score */
> -	if (!bannable)
> -		return;
> -
> -	if (banned) {
> -		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> -				 ctx->name, atomic_read(&ctx->guilty_count),
> -				 score);
> -		i915_gem_context_set_banned(ctx);
> -	}
> -
> -	if (!IS_ERR_OR_NULL(ctx->file_priv))
> -		i915_gem_client_mark_guilty(ctx->file_priv, ctx);
> -}
> -
> -static void i915_gem_context_mark_innocent(struct i915_gem_context *ctx)
> -{
> -	atomic_inc(&ctx->active_count);
> -}
> -
>  struct i915_request *
>  i915_gem_find_active_request(struct intel_engine_cs *engine)
>  {
> @@ -3045,395 +2993,6 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
>  	return active;
>  }
>  
> -/*
> - * Ensure irq handler finishes, and not run again.
> - * Also return the active request so that we only search for it once.
> - */
> -struct i915_request *
> -i915_gem_reset_prepare_engine(struct intel_engine_cs *engine)
> -{
> -	struct i915_request *request;
> -
> -	/*
> -	 * During the reset sequence, we must prevent the engine from
> -	 * entering RC6. As the context state is undefined until we restart
> -	 * the engine, if it does enter RC6 during the reset, the state
> -	 * written to the powercontext is undefined and so we may lose
> -	 * GPU state upon resume, i.e. fail to restart after a reset.
> -	 */
> -	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
> -
> -	request = engine->reset.prepare(engine);
> -	if (request && request->fence.error == -EIO)
> -		request = ERR_PTR(-EIO); /* Previous reset failed! */
> -
> -	return request;
> -}
> -
> -int i915_gem_reset_prepare(struct drm_i915_private *dev_priv)
> -{
> -	struct intel_engine_cs *engine;
> -	struct i915_request *request;
> -	enum intel_engine_id id;
> -	int err = 0;
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		request = i915_gem_reset_prepare_engine(engine);
> -		if (IS_ERR(request)) {
> -			err = PTR_ERR(request);
> -			continue;
> -		}
> -
> -		engine->hangcheck.active_request = request;
> -	}
> -
> -	i915_gem_revoke_fences(dev_priv);
> -	intel_uc_sanitize(dev_priv);
> -
> -	return err;
> -}
> -
> -static void engine_skip_context(struct i915_request *request)
> -{
> -	struct intel_engine_cs *engine = request->engine;
> -	struct i915_gem_context *hung_ctx = request->gem_context;
> -	struct i915_timeline *timeline = request->timeline;
> -	unsigned long flags;
> -
> -	GEM_BUG_ON(timeline == &engine->timeline);
> -
> -	spin_lock_irqsave(&engine->timeline.lock, flags);
> -	spin_lock(&timeline->lock);
> -
> -	list_for_each_entry_continue(request, &engine->timeline.requests, link)
> -		if (request->gem_context == hung_ctx)
> -			i915_request_skip(request, -EIO);
> -
> -	list_for_each_entry(request, &timeline->requests, link)
> -		i915_request_skip(request, -EIO);
> -
> -	spin_unlock(&timeline->lock);
> -	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -}
> -
> -/* Returns the request if it was guilty of the hang */
> -static struct i915_request *
> -i915_gem_reset_request(struct intel_engine_cs *engine,
> -		       struct i915_request *request,
> -		       bool stalled)
> -{
> -	/* The guilty request will get skipped on a hung engine.
> -	 *
> -	 * Users of client default contexts do not rely on logical
> -	 * state preserved between batches so it is safe to execute
> -	 * queued requests following the hang. Non default contexts
> -	 * rely on preserved state, so skipping a batch loses the
> -	 * evolution of the state and it needs to be considered corrupted.
> -	 * Executing more queued batches on top of corrupted state is
> -	 * risky. But we take the risk by trying to advance through
> -	 * the queued requests in order to make the client behaviour
> -	 * more predictable around resets, by not throwing away random
> -	 * amount of batches it has prepared for execution. Sophisticated
> -	 * clients can use gem_reset_stats_ioctl and dma fence status
> -	 * (exported via sync_file info ioctl on explicit fences) to observe
> -	 * when it loses the context state and should rebuild accordingly.
> -	 *
> -	 * The context ban, and ultimately the client ban, mechanism are safety
> -	 * valves if client submission ends up resulting in nothing more than
> -	 * subsequent hangs.
> -	 */
> -
> -	if (i915_request_completed(request)) {
> -		GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
> -			  engine->name, request->global_seqno,
> -			  request->fence.context, request->fence.seqno,
> -			  intel_engine_get_seqno(engine));
> -		stalled = false;
> -	}
> -
> -	if (stalled) {
> -		i915_gem_context_mark_guilty(request->gem_context);
> -		i915_request_skip(request, -EIO);
> -
> -		/* If this context is now banned, skip all pending requests. */
> -		if (i915_gem_context_is_banned(request->gem_context))
> -			engine_skip_context(request);
> -	} else {
> -		/*
> -		 * Since this is not the hung engine, it may have advanced
> -		 * since the hang declaration. Double check by refinding
> -		 * the active request at the time of the reset.
> -		 */
> -		request = i915_gem_find_active_request(engine);
> -		if (request) {
> -			unsigned long flags;
> -
> -			i915_gem_context_mark_innocent(request->gem_context);
> -			dma_fence_set_error(&request->fence, -EAGAIN);
> -
> -			/* Rewind the engine to replay the incomplete rq */
> -			spin_lock_irqsave(&engine->timeline.lock, flags);
> -			request = list_prev_entry(request, link);
> -			if (&request->link == &engine->timeline.requests)
> -				request = NULL;
> -			spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -		}
> -	}
> -
> -	return request;
> -}
> -
> -void i915_gem_reset_engine(struct intel_engine_cs *engine,
> -			   struct i915_request *request,
> -			   bool stalled)
> -{
> -	/*
> -	 * Make sure this write is visible before we re-enable the interrupt
> -	 * handlers on another CPU, as tasklet_enable() resolves to just
> -	 * a compiler barrier which is insufficient for our purpose here.
> -	 */
> -	smp_store_mb(engine->irq_posted, 0);
> -
> -	if (request)
> -		request = i915_gem_reset_request(engine, request, stalled);
> -
> -	/* Setup the CS to resume from the breadcrumb of the hung request */
> -	engine->reset.reset(engine, request);
> -}
> -
> -void i915_gem_reset(struct drm_i915_private *dev_priv,
> -		    unsigned int stalled_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -
> -	i915_retire_requests(dev_priv);
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		struct intel_context *ce;
> -
> -		i915_gem_reset_engine(engine,
> -				      engine->hangcheck.active_request,
> -				      stalled_mask & ENGINE_MASK(id));
> -		ce = fetch_and_zero(&engine->last_retired_context);
> -		if (ce)
> -			intel_context_unpin(ce);
> -
> -		/*
> -		 * Ostensibily, we always want a context loaded for powersaving,
> -		 * so if the engine is idle after the reset, send a request
> -		 * to load our scratch kernel_context.
> -		 *
> -		 * More mysteriously, if we leave the engine idle after a reset,
> -		 * the next userspace batch may hang, with what appears to be
> -		 * an incoherent read by the CS (presumably stale TLB). An
> -		 * empty request appears sufficient to paper over the glitch.
> -		 */
> -		if (intel_engine_is_idle(engine)) {
> -			struct i915_request *rq;
> -
> -			rq = i915_request_alloc(engine,
> -						dev_priv->kernel_context);
> -			if (!IS_ERR(rq))
> -				i915_request_add(rq);
> -		}
> -	}
> -
> -	i915_gem_restore_fences(dev_priv);
> -}
> -
> -void i915_gem_reset_finish_engine(struct intel_engine_cs *engine)
> -{
> -	engine->reset.finish(engine);
> -
> -	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> -}
> -
> -void i915_gem_reset_finish(struct drm_i915_private *dev_priv)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -
> -	for_each_engine(engine, dev_priv, id) {
> -		engine->hangcheck.active_request = NULL;
> -		i915_gem_reset_finish_engine(engine);
> -	}
> -}
> -
> -static void nop_submit_request(struct i915_request *request)
> -{
> -	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> -		  request->engine->name,
> -		  request->fence.context, request->fence.seqno);
> -	dma_fence_set_error(&request->fence, -EIO);
> -
> -	i915_request_submit(request);
> -}
> -
> -static void nop_complete_submit_request(struct i915_request *request)
> -{
> -	unsigned long flags;
> -
> -	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> -		  request->engine->name,
> -		  request->fence.context, request->fence.seqno);
> -	dma_fence_set_error(&request->fence, -EIO);
> -
> -	spin_lock_irqsave(&request->engine->timeline.lock, flags);
> -	__i915_request_submit(request);
> -	intel_engine_init_global_seqno(request->engine, request->global_seqno);
> -	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
> -}
> -
> -void i915_gem_set_wedged(struct drm_i915_private *i915)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	GEM_TRACE("start\n");
> -
> -	if (GEM_SHOW_DEBUG()) {
> -		struct drm_printer p = drm_debug_printer(__func__);
> -
> -		for_each_engine(engine, i915, id)
> -			intel_engine_dump(engine, &p, "%s\n", engine->name);
> -	}
> -
> -	set_bit(I915_WEDGED, &i915->gpu_error.flags);
> -	smp_mb__after_atomic();
> -
> -	/*
> -	 * First, stop submission to hw, but do not yet complete requests by
> -	 * rolling the global seqno forward (since this would complete requests
> -	 * for which we haven't set the fence error to EIO yet).
> -	 */
> -	for_each_engine(engine, i915, id) {
> -		i915_gem_reset_prepare_engine(engine);
> -
> -		engine->submit_request = nop_submit_request;
> -		engine->schedule = NULL;
> -	}
> -	i915->caps.scheduler = 0;
> -
> -	/* Even if the GPU reset fails, it should still stop the engines */
> -	intel_gpu_reset(i915, ALL_ENGINES);
> -
> -	/*
> -	 * Make sure no one is running the old callback before we proceed with
> -	 * cancelling requests and resetting the completion tracking. Otherwise
> -	 * we might submit a request to the hardware which never completes.
> -	 */
> -	synchronize_rcu();
> -
> -	for_each_engine(engine, i915, id) {
> -		/* Mark all executing requests as skipped */
> -		engine->cancel_requests(engine);
> -
> -		/*
> -		 * Only once we've force-cancelled all in-flight requests can we
> -		 * start to complete all requests.
> -		 */
> -		engine->submit_request = nop_complete_submit_request;
> -	}
> -
> -	/*
> -	 * Make sure no request can slip through without getting completed by
> -	 * either this call here to intel_engine_init_global_seqno, or the one
> -	 * in nop_complete_submit_request.
> -	 */
> -	synchronize_rcu();
> -
> -	for_each_engine(engine, i915, id) {
> -		unsigned long flags;
> -
> -		/*
> -		 * Mark all pending requests as complete so that any concurrent
> -		 * (lockless) lookup doesn't try and wait upon the request as we
> -		 * reset it.
> -		 */
> -		spin_lock_irqsave(&engine->timeline.lock, flags);
> -		intel_engine_init_global_seqno(engine,
> -					       intel_engine_last_submit(engine));
> -		spin_unlock_irqrestore(&engine->timeline.lock, flags);
> -
> -		i915_gem_reset_finish_engine(engine);
> -	}
> -
> -	GEM_TRACE("end\n");
> -
> -	wake_up_all(&i915->gpu_error.reset_queue);
> -}
> -
> -bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> -{
> -	struct i915_timeline *tl;
> -
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
> -		return true;
> -
> -	GEM_TRACE("start\n");
> -
> -	/*
> -	 * Before unwedging, make sure that all pending operations
> -	 * are flushed and errored out - we may have requests waiting upon
> -	 * third party fences. We marked all inflight requests as EIO, and
> -	 * every execbuf since returned EIO, for consistency we want all
> -	 * the currently pending requests to also be marked as EIO, which
> -	 * is done inside our nop_submit_request - and so we must wait.
> -	 *
> -	 * No more can be submitted until we reset the wedged bit.
> -	 */
> -	list_for_each_entry(tl, &i915->gt.timelines, link) {
> -		struct i915_request *rq;
> -
> -		rq = i915_gem_active_peek(&tl->last_request,
> -					  &i915->drm.struct_mutex);
> -		if (!rq)
> -			continue;
> -
> -		/*
> -		 * We can't use our normal waiter as we want to
> -		 * avoid recursively trying to handle the current
> -		 * reset. The basic dma_fence_default_wait() installs
> -		 * a callback for dma_fence_signal(), which is
> -		 * triggered by our nop handler (indirectly, the
> -		 * callback enables the signaler thread which is
> -		 * woken by the nop_submit_request() advancing the seqno
> -		 * and when the seqno passes the fence, the signaler
> -		 * then signals the fence waking us up).
> -		 */
> -		if (dma_fence_default_wait(&rq->fence, true,
> -					   MAX_SCHEDULE_TIMEOUT) < 0)
> -			return false;
> -	}
> -	i915_retire_requests(i915);
> -	GEM_BUG_ON(i915->gt.active_requests);
> -
> -	/*
> -	 * Undo nop_submit_request. We prevent all new i915 requests from
> -	 * being queued (by disallowing execbuf whilst wedged) so having
> -	 * waited for all active requests above, we know the system is idle
> -	 * and do not have to worry about a thread being inside
> -	 * engine->submit_request() as we swap over. So unlike installing
> -	 * the nop_submit_request on reset, we can do this from normal
> -	 * context and do not require stop_machine().
> -	 */
> -	intel_engines_reset_default_submission(i915);
> -	i915_gem_contexts_lost(i915);
> -
> -	GEM_TRACE("end\n");
> -
> -	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> -	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> -
> -	return true;
> -}
> -
>  static void
>  i915_gem_retire_work_handler(struct work_struct *work)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 495b9d27990e..76daa31dc2ba 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -2928,46 +2928,6 @@ static irqreturn_t gen8_irq_handler(int irq, void *arg)
>  	return IRQ_HANDLED;
>  }
>  
> -struct wedge_me {
> -	struct delayed_work work;
> -	struct drm_i915_private *i915;
> -	const char *name;
> -};
> -
> -static void wedge_me(struct work_struct *work)
> -{
> -	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> -
> -	dev_err(w->i915->drm.dev,
> -		"%s timed out, cancelling all in-flight rendering.\n",
> -		w->name);
> -	i915_gem_set_wedged(w->i915);
> -}
> -
> -static void __init_wedge(struct wedge_me *w,
> -			 struct drm_i915_private *i915,
> -			 long timeout,
> -			 const char *name)
> -{
> -	w->i915 = i915;
> -	w->name = name;
> -
> -	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> -	schedule_delayed_work(&w->work, timeout);
> -}
> -
> -static void __fini_wedge(struct wedge_me *w)
> -{
> -	cancel_delayed_work_sync(&w->work);
> -	destroy_delayed_work_on_stack(&w->work);
> -	w->i915 = NULL;
> -}
> -
> -#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
> -	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
> -	     (W)->i915;							\
> -	     __fini_wedge((W)))
> -
>  static u32
>  gen11_gt_engine_identity(struct drm_i915_private * const i915,
>  			 const unsigned int bank, const unsigned int bit)
> @@ -3172,186 +3132,6 @@ static irqreturn_t gen11_irq_handler(int irq, void *arg)
>  	return IRQ_HANDLED;
>  }
>  
> -static void i915_reset_device(struct drm_i915_private *dev_priv,
> -			      u32 engine_mask,
> -			      const char *reason)
> -{
> -	struct i915_gpu_error *error = &dev_priv->gpu_error;
> -	struct kobject *kobj = &dev_priv->drm.primary->kdev->kobj;
> -	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> -	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> -	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> -	struct wedge_me w;
> -
> -	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> -
> -	DRM_DEBUG_DRIVER("resetting chip\n");
> -	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> -
> -	/* Use a watchdog to ensure that our reset completes */
> -	i915_wedge_on_timeout(&w, dev_priv, 5*HZ) {
> -		intel_prepare_reset(dev_priv);
> -
> -		error->reason = reason;
> -		error->stalled_mask = engine_mask;
> -
> -		/* Signal that locked waiters should reset the GPU */
> -		smp_mb__before_atomic();
> -		set_bit(I915_RESET_HANDOFF, &error->flags);
> -		wake_up_all(&error->wait_queue);
> -
> -		/* Wait for anyone holding the lock to wakeup, without
> -		 * blocking indefinitely on struct_mutex.
> -		 */
> -		do {
> -			if (mutex_trylock(&dev_priv->drm.struct_mutex)) {
> -				i915_reset(dev_priv, engine_mask, reason);
> -				mutex_unlock(&dev_priv->drm.struct_mutex);
> -			}
> -		} while (wait_on_bit_timeout(&error->flags,
> -					     I915_RESET_HANDOFF,
> -					     TASK_UNINTERRUPTIBLE,
> -					     1));
> -
> -		error->stalled_mask = 0;
> -		error->reason = NULL;
> -
> -		intel_finish_reset(dev_priv);
> -	}
> -
> -	if (!test_bit(I915_WEDGED, &error->flags))
> -		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
> -}
> -
> -static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> -{
> -	u32 eir;
> -
> -	if (!IS_GEN2(dev_priv))
> -		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
> -
> -	if (INTEL_GEN(dev_priv) < 4)
> -		I915_WRITE(IPEIR, I915_READ(IPEIR));
> -	else
> -		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
> -
> -	I915_WRITE(EIR, I915_READ(EIR));
> -	eir = I915_READ(EIR);
> -	if (eir) {
> -		/*
> -		 * some errors might have become stuck,
> -		 * mask them.
> -		 */
> -		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
> -		I915_WRITE(EMR, I915_READ(EMR) | eir);
> -		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
> -	}
> -}
> -
> -/**
> - * i915_handle_error - handle a gpu error
> - * @dev_priv: i915 device private
> - * @engine_mask: mask representing engines that are hung
> - * @flags: control flags
> - * @fmt: Error message format string
> - *
> - * Do some basic checking of register state at error time and
> - * dump it to the syslog.  Also call i915_capture_error_state() to make
> - * sure we get a record and make it available in debugfs.  Fire a uevent
> - * so userspace knows something bad happened (should trigger collection
> - * of a ring dump etc.).
> - */
> -void i915_handle_error(struct drm_i915_private *dev_priv,
> -		       u32 engine_mask,
> -		       unsigned long flags,
> -		       const char *fmt, ...)
> -{
> -	struct intel_engine_cs *engine;
> -	unsigned int tmp;
> -	char error_msg[80];
> -	char *msg = NULL;
> -
> -	if (fmt) {
> -		va_list args;
> -
> -		va_start(args, fmt);
> -		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> -		va_end(args);
> -
> -		msg = error_msg;
> -	}
> -
> -	/*
> -	 * In most cases it's guaranteed that we get here with an RPM
> -	 * reference held, for example because there is a pending GPU
> -	 * request that won't finish until the reset is done. This
> -	 * isn't the case at least when we get here by doing a
> -	 * simulated reset via debugfs, so get an RPM reference.
> -	 */
> -	intel_runtime_pm_get(dev_priv);
> -
> -	engine_mask &= INTEL_INFO(dev_priv)->ring_mask;
> -
> -	if (flags & I915_ERROR_CAPTURE) {
> -		i915_capture_error_state(dev_priv, engine_mask, msg);
> -		i915_clear_error_registers(dev_priv);
> -	}
> -
> -	/*
> -	 * Try engine reset when available. We fall back to full reset if
> -	 * single reset fails.
> -	 */
> -	if (intel_has_reset_engine(dev_priv)) {
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
> -			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					     &dev_priv->gpu_error.flags))
> -				continue;
> -
> -			if (i915_reset_engine(engine, msg) == 0)
> -				engine_mask &= ~intel_engine_flag(engine);
> -
> -			clear_bit(I915_RESET_ENGINE + engine->id,
> -				  &dev_priv->gpu_error.flags);
> -			wake_up_bit(&dev_priv->gpu_error.flags,
> -				    I915_RESET_ENGINE + engine->id);
> -		}
> -	}
> -
> -	if (!engine_mask)
> -		goto out;
> -
> -	/* Full reset needs the mutex, stop any other user trying to do so. */
> -	if (test_and_set_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags)) {
> -		wait_event(dev_priv->gpu_error.reset_queue,
> -			   !test_bit(I915_RESET_BACKOFF,
> -				     &dev_priv->gpu_error.flags));
> -		goto out;
> -	}
> -
> -	/* Prevent any other reset-engine attempt. */
> -	for_each_engine(engine, dev_priv, tmp) {
> -		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					&dev_priv->gpu_error.flags))
> -			wait_on_bit(&dev_priv->gpu_error.flags,
> -				    I915_RESET_ENGINE + engine->id,
> -				    TASK_UNINTERRUPTIBLE);
> -	}
> -
> -	i915_reset_device(dev_priv, engine_mask, msg);
> -
> -	for_each_engine(engine, dev_priv, tmp) {
> -		clear_bit(I915_RESET_ENGINE + engine->id,
> -			  &dev_priv->gpu_error.flags);
> -	}
> -
> -	clear_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags);
> -	wake_up_all(&dev_priv->gpu_error.reset_queue);
> -
> -out:
> -	intel_runtime_pm_put(dev_priv);
> -}
> -
>  /* Called from drm generic code, passed 'crtc' which
>   * we use as a pipe index
>   */
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 5c2c93cbab12..9bbea7baa55d 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -29,6 +29,7 @@
>  #include <linux/sched/signal.h>
>  
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static const char *i915_fence_get_driver_name(struct dma_fence *fence)
>  {
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> new file mode 100644
> index 000000000000..edf29da15a99
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -0,0 +1,1271 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2008-2018 Intel Corporation
> + */
> +
> +#include "i915_drv.h"
> +#include "i915_gpu_error.h"
> +#include "i915_reset.h"
> +
> +#include "intel_guc.h"
> +
> +static void engine_skip_context(struct i915_request *rq)
> +{
> +	struct intel_engine_cs *engine = rq->engine;
> +	struct i915_gem_context *hung_ctx = rq->gem_context;
> +	struct i915_timeline *timeline = rq->timeline;
> +	unsigned long flags;
> +
> +	GEM_BUG_ON(timeline == &engine->timeline);
> +
> +	spin_lock_irqsave(&engine->timeline.lock, flags);
> +	spin_lock(&timeline->lock);
> +
> +	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
> +		if (rq->gem_context == hung_ctx)
> +			i915_request_skip(rq, -EIO);
> +
> +	list_for_each_entry(rq, &timeline->requests, link)
> +		i915_request_skip(rq, -EIO);
> +
> +	spin_unlock(&timeline->lock);
> +	spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +}
> +
> +static void client_mark_guilty(struct drm_i915_file_private *file_priv,
> +			       const struct i915_gem_context *ctx)
> +{
> +	unsigned int score;
> +	unsigned long prev_hang;
> +
> +	if (i915_gem_context_is_banned(ctx))
> +		score = I915_CLIENT_SCORE_CONTEXT_BAN;
> +	else
> +		score = 0;
> +
> +	prev_hang = xchg(&file_priv->hang_timestamp, jiffies);
> +	if (time_before(jiffies, prev_hang + I915_CLIENT_FAST_HANG_JIFFIES))
> +		score += I915_CLIENT_SCORE_HANG_FAST;
> +
> +	if (score) {
> +		atomic_add(score, &file_priv->ban_score);
> +
> +		DRM_DEBUG_DRIVER("client %s: gained %u ban score, now %u\n",
> +				 ctx->name, score,
> +				 atomic_read(&file_priv->ban_score));
> +	}
> +}
> +
> +static void context_mark_guilty(struct i915_gem_context *ctx)
> +{
> +	unsigned int score;
> +	bool banned, bannable;
> +
> +	atomic_inc(&ctx->guilty_count);
> +
> +	bannable = i915_gem_context_is_bannable(ctx);
> +	score = atomic_add_return(CONTEXT_SCORE_GUILTY, &ctx->ban_score);
> +	banned = score >= CONTEXT_SCORE_BAN_THRESHOLD;
> +
> +	/* Cool contexts don't accumulate client ban score */
> +	if (!bannable)
> +		return;
> +
> +	if (banned) {
> +		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
> +				 ctx->name, atomic_read(&ctx->guilty_count),
> +				 score);
> +		i915_gem_context_set_banned(ctx);
> +	}
> +
> +	if (!IS_ERR_OR_NULL(ctx->file_priv))
> +		client_mark_guilty(ctx->file_priv, ctx);
> +}
> +
> +static void context_mark_innocent(struct i915_gem_context *ctx)
> +{
> +	atomic_inc(&ctx->active_count);
> +}
> +
> +static void gen3_stop_engine(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	const u32 base = engine->mmio_base;
> +
> +	if (intel_engine_stop_cs(engine))
> +		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
> +
> +	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> +	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> +
> +	I915_WRITE_FW(RING_HEAD(base), 0);
> +	I915_WRITE_FW(RING_TAIL(base), 0);
> +	POSTING_READ_FW(RING_TAIL(base));
> +
> +	/* The ring must be empty before it is disabled */
> +	I915_WRITE_FW(RING_CTL(base), 0);
> +
> +	/* Check acts as a post */
> +	if (I915_READ_FW(RING_HEAD(base)) != 0)
> +		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> +				 engine->name);
> +}
> +
> +static void i915_stop_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	if (INTEL_GEN(i915) < 3)
> +		return;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, id)
> +		gen3_stop_engine(engine);
> +}
> +
> +static bool i915_in_reset(struct pci_dev *pdev)
> +{
> +	u8 gdrst;
> +
> +	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> +	return gdrst & GRDOM_RESET_STATUS;
> +}
> +
> +static int i915_do_reset(struct drm_i915_private *i915,
> +			 unsigned int engine_mask)
> +{
> +	struct pci_dev *pdev = i915->drm.pdev;
> +	int err;
> +
> +	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
> +	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> +	usleep_range(50, 200);
> +	err = wait_for(i915_in_reset(pdev), 500);
> +
> +	/* Clear the reset request. */
> +	pci_write_config_byte(pdev, I915_GDRST, 0);
> +	usleep_range(50, 200);
> +	if (!err)
> +		err = wait_for(!i915_in_reset(pdev), 500);
> +
> +	return err;
> +}
> +
> +static bool g4x_reset_complete(struct pci_dev *pdev)
> +{
> +	u8 gdrst;
> +
> +	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> +	return (gdrst & GRDOM_RESET_ENABLE) == 0;
> +}
> +
> +static int g33_do_reset(struct drm_i915_private *i915, unsigned int engine_mask)
> +{
> +	struct pci_dev *pdev = i915->drm.pdev;
> +
> +	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> +	return wait_for(g4x_reset_complete(pdev), 500);
> +}
> +
> +static int g4x_do_reset(struct drm_i915_private *dev_priv,
> +			unsigned int engine_mask)
> +{
> +	struct pci_dev *pdev = dev_priv->drm.pdev;
> +	int ret;
> +
> +	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> +	I915_WRITE(VDECCLK_GATE_D,
> +		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ(VDECCLK_GATE_D);
> +
> +	pci_write_config_byte(pdev, I915_GDRST,
> +			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> +	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> +		goto out;
> +	}
> +
> +	pci_write_config_byte(pdev, I915_GDRST,
> +			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> +	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> +		goto out;
> +	}
> +
> +out:
> +	pci_write_config_byte(pdev, I915_GDRST, 0);
> +
> +	I915_WRITE(VDECCLK_GATE_D,
> +		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ(VDECCLK_GATE_D);
> +
> +	return ret;
> +}
> +
> +static int ironlake_do_reset(struct drm_i915_private *dev_priv,
> +			     unsigned int engine_mask)
> +{
> +	int ret;
> +
> +	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> +	ret = intel_wait_for_register(dev_priv,
> +				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> +				      500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> +		goto out;
> +	}
> +
> +	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> +	ret = intel_wait_for_register(dev_priv,
> +				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> +				      500);
> +	if (ret) {
> +		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> +		goto out;
> +	}
> +
> +out:
> +	I915_WRITE(ILK_GDSR, 0);
> +	POSTING_READ(ILK_GDSR);
> +	return ret;
> +}
> +
> +/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
> +static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
> +				u32 hw_domain_mask)
> +{
> +	int err;
> +
> +	/*
> +	 * GEN6_GDRST is not in the gt power well, no need to check
> +	 * for fifo space for the write or forcewake the chip for
> +	 * the read
> +	 */
> +	I915_WRITE_FW(GEN6_GDRST, hw_domain_mask);
> +
> +	/* Wait for the device to ack the reset requests */
> +	err = __intel_wait_for_register_fw(dev_priv,
> +					   GEN6_GDRST, hw_domain_mask, 0,
> +					   500, 0,
> +					   NULL);
> +	if (err)
> +		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
> +				 hw_domain_mask);
> +
> +	return err;
> +}
> +
> +static int gen6_reset_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> +		[RCS] = GEN6_GRDOM_RENDER,
> +		[BCS] = GEN6_GRDOM_BLT,
> +		[VCS] = GEN6_GRDOM_MEDIA,
> +		[VCS2] = GEN8_GRDOM_MEDIA2,
> +		[VECS] = GEN6_GRDOM_VECS,
> +	};
> +	u32 hw_mask;
> +
> +	if (engine_mask == ALL_ENGINES) {
> +		hw_mask = GEN6_GRDOM_FULL;
> +	} else {
> +		unsigned int tmp;
> +
> +		hw_mask = 0;
> +		for_each_engine_masked(engine, i915, engine_mask, tmp)
> +			hw_mask |= hw_engine_mask[engine->id];
> +	}
> +
> +	return gen6_hw_domain_reset(i915, hw_mask);
> +}
> +
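A quick worked example of the mask translation above, purely for illustration
(this only makes sense inside this file since gen6_hw_domain_reset() is static):

static int sketch_reset_rcs_and_vcs(struct drm_i915_private *i915)
{
	/* engine_mask = BIT(RCS) | BIT(VCS) would be folded into: */
	const u32 hw_mask = GEN6_GRDOM_RENDER | GEN6_GRDOM_MEDIA;

	return gen6_hw_domain_reset(i915, hw_mask);
}

while engine_mask == ALL_ENGINES short-circuits to GEN6_GRDOM_FULL.
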
> +static int gen11_reset_engines(struct drm_i915_private *i915,
> +			       unsigned int engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> +		[RCS] = GEN11_GRDOM_RENDER,
> +		[BCS] = GEN11_GRDOM_BLT,
> +		[VCS] = GEN11_GRDOM_MEDIA,
> +		[VCS2] = GEN11_GRDOM_MEDIA2,
> +		[VCS3] = GEN11_GRDOM_MEDIA3,
> +		[VCS4] = GEN11_GRDOM_MEDIA4,
> +		[VECS] = GEN11_GRDOM_VECS,
> +		[VECS2] = GEN11_GRDOM_VECS2,
> +	};
> +	u32 hw_mask;
> +
> +	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
> +
> +	if (engine_mask == ALL_ENGINES) {
> +		hw_mask = GEN11_GRDOM_FULL;
> +	} else {
> +		unsigned int tmp;
> +
> +		hw_mask = 0;
> +		for_each_engine_masked(engine, i915, engine_mask, tmp)
> +			hw_mask |= hw_engine_mask[engine->id];
> +	}
> +
> +	return gen6_hw_domain_reset(i915, hw_mask);
> +}
> +
> +static int gen8_reset_engine_start(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +	int ret;
> +
> +	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> +		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
> +
> +	ret = __intel_wait_for_register_fw(dev_priv,
> +					   RING_RESET_CTL(engine->mmio_base),
> +					   RESET_CTL_READY_TO_RESET,
> +					   RESET_CTL_READY_TO_RESET,
> +					   700, 0,
> +					   NULL);
> +	if (ret)
> +		DRM_ERROR("%s: reset request timeout\n", engine->name);
> +
> +	return ret;
> +}
> +
> +static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
> +{
> +	struct drm_i915_private *dev_priv = engine->i915;
> +
> +	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> +		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> +}
> +
> +static int gen8_reset_engines(struct drm_i915_private *i915,
> +			      unsigned int engine_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	unsigned int tmp;
> +	int ret;
> +
> +	for_each_engine_masked(engine, i915, engine_mask, tmp) {
> +		if (gen8_reset_engine_start(engine)) {
> +			ret = -EIO;
> +			goto not_ready;
> +		}
> +	}
> +
> +	if (INTEL_GEN(i915) >= 11)
> +		ret = gen11_reset_engines(i915, engine_mask);
> +	else
> +		ret = gen6_reset_engines(i915, engine_mask);
> +
> +not_ready:
> +	for_each_engine_masked(engine, i915, engine_mask, tmp)
> +		gen8_reset_engine_cancel(engine);
> +
> +	return ret;
> +}
> +
> +typedef int (*reset_func)(struct drm_i915_private *, unsigned int engine_mask);
> +
> +static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
> +{
> +	if (!i915_modparams.reset)
> +		return NULL;
> +
> +	if (INTEL_GEN(i915) >= 8)
> +		return gen8_reset_engines;
> +	else if (INTEL_GEN(i915) >= 6)
> +		return gen6_reset_engines;
> +	else if (IS_GEN5(i915))
> +		return ironlake_do_reset;
> +	else if (IS_G4X(i915))
> +		return g4x_do_reset;
> +	else if (IS_G33(i915) || IS_PINEVIEW(i915))
> +		return g33_do_reset;
> +	else if (INTEL_GEN(i915) >= 3)
> +		return i915_do_reset;
> +	else
> +		return NULL;
> +}
> +
> +int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
> +{
> +	reset_func reset = intel_get_gpu_reset(i915);
> +	int retry;
> +	int ret;
> +
> +	/*
> +	 * We want to perform per-engine reset from atomic context (e.g.
> +	 * softirq), which imposes the constraint that we cannot sleep.
> +	 * However, experience suggests that spending a bit of time waiting
> +	 * for a reset helps in various cases, so for a full-device reset
> +	 * we apply the opposite rule and wait if we want to. As we should
> +	 * always follow up a failed per-engine reset with a full device reset,
> +	 * being a little faster, stricter and more error prone for the
> +	 * atomic case seems an acceptable compromise.
> +	 *
> +	 * Unfortunately this leads to a bimodal routine, when the goal was
> +	 * to have a single reset function that worked for resetting any
> +	 * number of engines simultaneously.
> +	 */
> +	might_sleep_if(engine_mask == ALL_ENGINES);
> +
> +	/*
> +	 * If the power well sleeps during the reset, the reset
> +	 * request may be dropped and never complete (causing -EIO).
> +	 */
> +	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
> +	for (retry = 0; retry < 3; retry++) {
> +		/*
> +		 * We stop engines, otherwise we might get a failed reset and a
> +		 * dead gpu (on elk). Also, a modern gpu such as kbl can suffer
> +		 * from a system hang if a batchbuffer is progressing when
> +		 * the reset is issued, regardless of the READY_TO_RESET ack.
> +		 * Thus assume it is best to stop engines on all gens
> +		 * where we have a gpu reset.
> +		 *
> +		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> +		 *
> +		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
> +		 *
> +		 * FIXME: Wa for more modern gens needs to be validated
> +		 */
> +		i915_stop_engines(i915, engine_mask);
> +
> +		ret = -ENODEV;
> +		if (reset) {
> +			GEM_TRACE("engine_mask=%x\n", engine_mask);
> +			ret = reset(i915, engine_mask);
> +		}
> +		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
> +			break;
> +
> +		cond_resched();
> +	}
> +	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
> +
> +	return ret;
> +}
> +
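If it helps to see the atomic-vs-sleeping split spelled out, here is a rough
sketch of the two call patterns (the wrapper names are invented for
illustration; only intel_gpu_reset(), intel_engine_flag() and ALL_ENGINES are
from the patch), matching how intel_gt_reset_engine() and i915_reset() use it
later in this file:

static int sketch_reset_single_engine(struct intel_engine_cs *engine)
{
	/* single-engine mask: may be called from softirq, must not sleep */
	return intel_gpu_reset(engine->i915, intel_engine_flag(engine));
}

static int sketch_reset_whole_device(struct drm_i915_private *i915)
{
	/* ALL_ENGINES: process context, allowed to wait between retries */
	might_sleep();
	return intel_gpu_reset(i915, ALL_ENGINES);
}
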
> +bool intel_has_gpu_reset(struct drm_i915_private *i915)
> +{
> +	return intel_get_gpu_reset(i915);
> +}
> +
> +bool intel_has_reset_engine(struct drm_i915_private *i915)
> +{
> +	return i915->info.has_reset_engine && i915_modparams.reset >= 2;
> +}
> +
> +int intel_reset_guc(struct drm_i915_private *i915)
> +{
> +	u32 guc_domain =
> +		INTEL_GEN(i915) >= 11 ? GEN11_GRDOM_GUC : GEN9_GRDOM_GUC;
> +	int ret;
> +
> +	GEM_BUG_ON(!HAS_GUC(i915));
> +
> +	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
> +	ret = gen6_hw_domain_reset(i915, guc_domain);
> +	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
> +
> +	return ret;
> +}
> +
> +/*
> + * Ensure the irq handler finishes, and is not run again.
> + * Also return the active request so that we only search for it once.
> + */
> +static struct i915_request *
> +reset_prepare_engine(struct intel_engine_cs *engine)
> +{
> +	struct i915_request *rq;
> +
> +	/*
> +	 * During the reset sequence, we must prevent the engine from
> +	 * entering RC6. As the context state is undefined until we restart
> +	 * the engine, if it does enter RC6 during the reset, the state
> +	 * written to the powercontext is undefined and so we may lose
> +	 * GPU state upon resume, i.e. fail to restart after a reset.
> +	 */
> +	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
> +
> +	rq = engine->reset.prepare(engine);
> +	if (rq && rq->fence.error == -EIO)
> +		rq = ERR_PTR(-EIO); /* Previous reset failed! */
> +
> +	return rq;
> +}
> +
> +static int reset_prepare(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	struct i915_request *rq;
> +	enum intel_engine_id id;
> +	int err = 0;
> +
> +	disable_irq(i915->drm.irq);
> +
> +	for_each_engine(engine, i915, id) {
> +		rq = reset_prepare_engine(engine);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			continue;
> +		}
> +
> +		engine->hangcheck.active_request = rq;
> +	}
> +
> +	i915_gem_revoke_fences(i915);
> +	intel_uc_sanitize(i915);
> +
> +	return err;
> +}
> +
> +/* Returns the request if it was guilty of the hang */
> +static struct i915_request *
> +reset_request(struct intel_engine_cs *engine,
> +	      struct i915_request *rq,
> +	      bool stalled)
> +{
> +	/*
> +	 * The guilty request will get skipped on a hung engine.
> +	 *
> +	 * Users of client default contexts do not rely on logical
> +	 * state preserved between batches so it is safe to execute
> +	 * queued requests following the hang. Non default contexts
> +	 * rely on preserved state, so skipping a batch loses the
> +	 * evolution of the state and it needs to be considered corrupted.
> +	 * Executing more queued batches on top of corrupted state is
> +	 * risky. But we take the risk by trying to advance through
> +	 * the queued requests in order to make the client behaviour
> +	 * more predictable around resets, by not throwing away a random
> +	 * number of batches it has prepared for execution. Sophisticated
> +	 * clients can use gem_reset_stats_ioctl and dma fence status
> +	 * (exported via the sync_file info ioctl on explicit fences) to observe
> +	 * when they lose the context state and should rebuild accordingly.
> +	 *
> +	 * The context ban, and ultimately the client ban, mechanism are safety
> +	 * valves if client submission ends up resulting in nothing more than
> +	 * subsequent hangs.
> +	 */
> +
> +	if (i915_request_completed(rq)) {
> +		GEM_TRACE("%s pardoned global=%d (fence %llx:%d), current %d\n",
> +			  engine->name, rq->global_seqno,
> +			  rq->fence.context, rq->fence.seqno,
> +			  intel_engine_get_seqno(engine));
> +		stalled = false;
> +	}
> +
> +	if (stalled) {
> +		context_mark_guilty(rq->gem_context);
> +		i915_request_skip(rq, -EIO);
> +
> +		/* If this context is now banned, skip all pending requests. */
> +		if (i915_gem_context_is_banned(rq->gem_context))
> +			engine_skip_context(rq);
> +	} else {
> +		/*
> +		 * Since this is not the hung engine, it may have advanced
> +		 * since the hang declaration. Double check by re-finding
> +		 * the active request at the time of the reset.
> +		 */
> +		rq = i915_gem_find_active_request(engine);
> +		if (rq) {
> +			unsigned long flags;
> +
> +			context_mark_innocent(rq->gem_context);
> +			dma_fence_set_error(&rq->fence, -EAGAIN);
> +
> +			/* Rewind the engine to replay the incomplete rq */
> +			spin_lock_irqsave(&engine->timeline.lock, flags);
> +			rq = list_prev_entry(rq, link);
> +			if (&rq->link == &engine->timeline.requests)
> +				rq = NULL;
> +			spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +		}
> +	}
> +
> +	return rq;
> +}
> +
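On the "sophisticated clients" point in the comment above: a rough userspace
sketch of polling the reset stats for a context, assuming the existing
DRM_IOCTL_I915_GET_RESET_STATS uapi (the header path depends on whether you
build against the kernel uapi headers or libdrm; error handling omitted):

#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Returns true if the given context lost state since the last query. */
static bool context_was_reset(int drm_fd, uint32_t ctx_id)
{
	struct drm_i915_reset_stats stats = { .ctx_id = ctx_id };

	if (ioctl(drm_fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
		return false; /* sketch: treat ioctl failure as "no reset" */

	/* batch_active: our batches declared guilty;
	 * batch_pending: our innocent batches thrown away along with them */
	return stats.batch_active || stats.batch_pending;
}
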
> +static void reset_engine(struct intel_engine_cs *engine,
> +			 struct i915_request *rq,
> +			 bool stalled)
> +{
> +	/*
> +	 * Make sure this write is visible before we re-enable the interrupt
> +	 * handlers on another CPU, as tasklet_enable() resolves to just
> +	 * a compiler barrier which is insufficient for our purpose here.
> +	 */
> +	smp_store_mb(engine->irq_posted, 0);
> +
> +	if (rq)
> +		rq = reset_request(engine, rq, stalled);
> +
> +	/* Set up the CS to resume from the breadcrumb of the hung request */
> +	engine->reset.reset(engine, rq);
> +}
> +
> +static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	i915_retire_requests(i915);
> +
> +	for_each_engine(engine, i915, id) {
> +		struct intel_context *ce;
> +
> +		reset_engine(engine,
> +			     engine->hangcheck.active_request,
> +			     stalled_mask & ENGINE_MASK(id));
> +		ce = fetch_and_zero(&engine->last_retired_context);
> +		if (ce)
> +			intel_context_unpin(ce);
> +
> +		/*
> +		 * Ostensibly, we always want a context loaded for powersaving,
> +		 * so if the engine is idle after the reset, send a request
> +		 * to load our scratch kernel_context.
> +		 *
> +		 * More mysteriously, if we leave the engine idle after a reset,
> +		 * the next userspace batch may hang, with what appears to be
> +		 * an incoherent read by the CS (presumably stale TLB). An
> +		 * empty request appears sufficient to paper over the glitch.
> +		 */
> +		if (intel_engine_is_idle(engine)) {
> +			struct i915_request *rq;
> +
> +			rq = i915_request_alloc(engine, i915->kernel_context);
> +			if (!IS_ERR(rq))
> +				i915_request_add(rq);
> +		}
> +	}
> +
> +	i915_gem_restore_fences(i915);
> +}
> +
> +static void reset_finish_engine(struct intel_engine_cs *engine)
> +{
> +	engine->reset.finish(engine);
> +
> +	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
> +}
> +
> +static void reset_finish(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +
> +	for_each_engine(engine, i915, id) {
> +		engine->hangcheck.active_request = NULL;
> +		reset_finish_engine(engine);
> +	}
> +
> +	enable_irq(i915->drm.irq);
> +}
> +
> +static void nop_submit_request(struct i915_request *rq)
> +{
> +	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> +		  rq->engine->name, rq->fence.context, rq->fence.seqno);
> +	dma_fence_set_error(&rq->fence, -EIO);
> +
> +	i915_request_submit(rq);
> +}
> +
> +static void nop_complete_submit_request(struct i915_request *rq)
> +{
> +	unsigned long flags;
> +
> +	GEM_TRACE("%s fence %llx:%d -> -EIO\n",
> +		  rq->engine->name,
> +		  rq->fence.context, rq->fence.seqno);
> +	dma_fence_set_error(&rq->fence, -EIO);
> +
> +	spin_lock_irqsave(&rq->engine->timeline.lock, flags);
> +	__i915_request_submit(rq);
> +	intel_engine_init_global_seqno(rq->engine, rq->global_seqno);
> +	spin_unlock_irqrestore(&rq->engine->timeline.lock, flags);
> +}
> +
> +void i915_gem_set_wedged(struct drm_i915_private *i915)
> +{
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +
> +	GEM_TRACE("start\n");
> +
> +	if (GEM_SHOW_DEBUG()) {
> +		struct drm_printer p = drm_debug_printer(__func__);
> +
> +		for_each_engine(engine, i915, id)
> +			intel_engine_dump(engine, &p, "%s\n", engine->name);
> +	}
> +
> +	set_bit(I915_WEDGED, &i915->gpu_error.flags);
> +	smp_mb__after_atomic();
> +
> +	/*
> +	 * First, stop submission to hw, but do not yet complete requests by
> +	 * rolling the global seqno forward (since this would complete requests
> +	 * for which we haven't set the fence error to EIO yet).
> +	 */
> +	for_each_engine(engine, i915, id) {
> +		reset_prepare_engine(engine);
> +
> +		engine->submit_request = nop_submit_request;
> +		engine->schedule = NULL;
> +	}
> +	i915->caps.scheduler = 0;
> +
> +	/* Even if the GPU reset fails, it should still stop the engines */
> +	intel_gpu_reset(i915, ALL_ENGINES);
> +
> +	/*
> +	 * Make sure no one is running the old callback before we proceed with
> +	 * cancelling requests and resetting the completion tracking. Otherwise
> +	 * we might submit a request to the hardware which never completes.
> +	 */
> +	synchronize_rcu();
> +
> +	for_each_engine(engine, i915, id) {
> +		/* Mark all executing requests as skipped */
> +		engine->cancel_requests(engine);
> +
> +		/*
> +		 * Only once we've force-cancelled all in-flight requests can we
> +		 * start to complete all requests.
> +		 */
> +		engine->submit_request = nop_complete_submit_request;
> +	}
> +
> +	/*
> +	 * Make sure no request can slip through without getting completed by
> +	 * either this call here to intel_engine_init_global_seqno, or the one
> +	 * in nop_complete_submit_request.
> +	 */
> +	synchronize_rcu();
> +
> +	for_each_engine(engine, i915, id) {
> +		unsigned long flags;
> +
> +		/*
> +		 * Mark all pending requests as complete so that any concurrent
> +		 * (lockless) lookup doesn't try and wait upon the request as we
> +		 * reset it.
> +		 */
> +		spin_lock_irqsave(&engine->timeline.lock, flags);
> +		intel_engine_init_global_seqno(engine,
> +					       intel_engine_last_submit(engine));
> +		spin_unlock_irqrestore(&engine->timeline.lock, flags);
> +
> +		reset_finish_engine(engine);
> +	}
> +
> +	GEM_TRACE("end\n");
> +
> +	wake_up_all(&i915->gpu_error.reset_queue);
> +}
> +
> +bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> +{
> +	struct i915_timeline *tl;
> +
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +	if (!test_bit(I915_WEDGED, &i915->gpu_error.flags))
> +		return true;
> +
> +	GEM_TRACE("start\n");
> +
> +	/*
> +	 * Before unwedging, make sure that all pending operations
> +	 * are flushed and errored out - we may have requests waiting upon
> +	 * third party fences. We marked all inflight requests as EIO, and
> +	 * every execbuf since returned EIO, for consistency we want all
> +	 * the currently pending requests to also be marked as EIO, which
> +	 * is done inside our nop_submit_request - and so we must wait.
> +	 *
> +	 * No more can be submitted until we reset the wedged bit.
> +	 */
> +	list_for_each_entry(tl, &i915->gt.timelines, link) {
> +		struct i915_request *rq;
> +
> +		rq = i915_gem_active_peek(&tl->last_request,
> +					  &i915->drm.struct_mutex);
> +		if (!rq)
> +			continue;
> +
> +		/*
> +		 * We can't use our normal waiter as we want to
> +		 * avoid recursively trying to handle the current
> +		 * reset. The basic dma_fence_default_wait() installs
> +		 * a callback for dma_fence_signal(), which is
> +		 * triggered by our nop handler (indirectly, the
> +		 * callback enables the signaler thread which is
> +		 * woken by the nop_submit_request() advancing the seqno
> +		 * and when the seqno passes the fence, the signaler
> +		 * then signals the fence waking us up).
> +		 */
> +		if (dma_fence_default_wait(&rq->fence, true,
> +					   MAX_SCHEDULE_TIMEOUT) < 0)
> +			return false;
> +	}
> +	i915_retire_requests(i915);
> +	GEM_BUG_ON(i915->gt.active_requests);
> +
> +	/*
> +	 * Undo nop_submit_request. We prevent all new i915 requests from
> +	 * being queued (by disallowing execbuf whilst wedged) so having
> +	 * waited for all active requests above, we know the system is idle
> +	 * and do not have to worry about a thread being inside
> +	 * engine->submit_request() as we swap over. So unlike installing
> +	 * the nop_submit_request on reset, we can do this from normal
> +	 * context and do not require stop_machine().
> +	 */
> +	intel_engines_reset_default_submission(i915);
> +	i915_gem_contexts_lost(i915);
> +
> +	GEM_TRACE("end\n");
> +
> +	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
> +	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> +
> +	return true;
> +}
> +
> +/**
> + * i915_reset - reset chip after a hang
> + * @i915: #drm_i915_private to reset
> + * @stalled_mask: mask of the stalled engines with the guilty requests
> + * @reason: user error message for why we are resetting
> + *
> + * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
> + * on failure.
> + *
> + * Caller must hold the struct_mutex.
> + *
> + * Procedure is fairly simple:
> + *   - reset the chip using the reset reg
> + *   - re-init context state
> + *   - re-init hardware status page
> + *   - re-init ring buffer
> + *   - re-init interrupt state
> + *   - re-init display
> + */
> +void i915_reset(struct drm_i915_private *i915,
> +		unsigned int stalled_mask,
> +		const char *reason)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	int ret;
> +	int i;
> +
> +	GEM_TRACE("flags=%lx\n", error->flags);
> +
> +	might_sleep();
> +	lockdep_assert_held(&i915->drm.struct_mutex);
> +	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
> +
> +	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
> +		return;
> +
> +	/* Clear any previous failed attempts at recovery. Time to try again. */
> +	if (!i915_gem_unset_wedged(i915))
> +		goto wakeup;
> +
> +	if (reason)
> +		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
> +	error->reset_count++;
> +
> +	ret = reset_prepare(i915);
> +	if (ret) {
> +		dev_err(i915->drm.dev, "GPU recovery failed\n");
> +		goto taint;
> +	}
> +
> +	if (!intel_has_gpu_reset(i915)) {
> +		if (i915_modparams.reset)
> +			dev_err(i915->drm.dev, "GPU reset not supported\n");
> +		else
> +			DRM_DEBUG_DRIVER("GPU reset disabled\n");
> +		goto error;
> +	}
> +
> +	for (i = 0; i < 3; i++) {
> +		ret = intel_gpu_reset(i915, ALL_ENGINES);
> +		if (ret == 0)
> +			break;
> +
> +		msleep(100);
> +	}
> +	if (ret) {
> +		dev_err(i915->drm.dev, "Failed to reset chip\n");
> +		goto taint;
> +	}
> +
> +	/* Ok, now get things going again... */
> +
> +	/*
> +	 * Everything depends on having the GTT running, so we need to start
> +	 * there.
> +	 */
> +	ret = i915_ggtt_enable_hw(i915);
> +	if (ret) {
> +		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
> +			  ret);
> +		goto error;
> +	}
> +
> +	gt_reset(i915, stalled_mask);
> +	intel_overlay_reset(i915);
> +
> +	/*
> +	 * Next we need to restore the context, but we don't use those
> +	 * yet either...
> +	 *
> +	 * Ring buffer needs to be re-initialized in the KMS case, or if X
> +	 * was running at the time of the reset (i.e. we weren't VT
> +	 * switched away).
> +	 */
> +	ret = i915_gem_init_hw(i915);
> +	if (ret) {
> +		DRM_ERROR("Failed to initialise HW following reset (%d)\n",
> +			  ret);
> +		goto error;
> +	}
> +
> +	i915_queue_hangcheck(i915);
> +
> +finish:
> +	reset_finish(i915);
> +wakeup:
> +	clear_bit(I915_RESET_HANDOFF, &error->flags);
> +	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
> +	return;
> +
> +taint:
> +	/*
> +	 * History tells us that if we cannot reset the GPU now, we
> +	 * never will. This then impacts everything that is run
> +	 * subsequently. On failing the reset, we mark the driver
> +	 * as wedged, preventing further execution on the GPU.
> +	 * We also want to go one step further and add a taint to the
> +	 * kernel so that any subsequent faults can be traced back to
> +	 * this failure. This is important for CI, where if the
> +	 * GPU/driver fails we would like to reboot and restart testing
> +	 * rather than continue on into oblivion. For everyone else,
> +	 * the system should still plod along, but they have been warned!
> +	 */
> +	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
> +error:
> +	i915_gem_set_wedged(i915);
> +	i915_retire_requests(i915);
> +	goto finish;
> +}
> +
> +static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
> +					struct intel_engine_cs *engine)
> +{
> +	return intel_gpu_reset(i915, intel_engine_flag(engine));
> +}
> +
> +/**
> + * i915_reset_engine - reset GPU engine to recover from a hang
> + * @engine: engine to reset
> + * @msg: reason for GPU reset; or NULL for no dev_notice()
> + *
> + * Reset a specific GPU engine. Useful if a hang is detected.
> + * Returns zero on successful reset or otherwise an error code.
> + *
> + * Procedure is:
> + *  - identify the request that caused the hang and drop it
> + *  - reset engine (which will force the engine to idle)
> + *  - re-init/configure engine
> + */
> +int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
> +{
> +	struct i915_gpu_error *error = &engine->i915->gpu_error;
> +	struct i915_request *active_request;
> +	int ret;
> +
> +	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
> +	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
> +
> +	active_request = reset_prepare_engine(engine);
> +	if (IS_ERR_OR_NULL(active_request)) {
> +		/* Either the previous reset failed, or we pardon the reset. */
> +		ret = PTR_ERR(active_request);
> +		goto out;
> +	}
> +
> +	if (msg)
> +		dev_notice(engine->i915->drm.dev,
> +			   "Resetting %s for %s\n", engine->name, msg);
> +	error->reset_engine_count[engine->id]++;
> +
> +	if (!engine->i915->guc.execbuf_client)
> +		ret = intel_gt_reset_engine(engine->i915, engine);
> +	else
> +		ret = intel_guc_reset_engine(&engine->i915->guc, engine);
> +	if (ret) {
> +		/* If we fail here, we expect to fall back to a global reset */
> +		DRM_DEBUG_DRIVER("%sFailed to reset %s, ret=%d\n",
> +				 engine->i915->guc.execbuf_client ? "GuC " : "",
> +				 engine->name, ret);
> +		goto out;
> +	}
> +
> +	/*
> +	 * The request that caused the hang is stuck on elsp; we know the
> +	 * active request and can drop it, adjusting HEAD to skip the offending
> +	 * request so that the remaining requests in the queue resume executing.
> +	 */
> +	reset_engine(engine, active_request, true);
> +
> +	/*
> +	 * The engine and its registers (and workarounds in case of render)
> +	 * have been reset to their default values. Follow the init_ring
> +	 * process to program RING_MODE, HWSP and re-enable submission.
> +	 */
> +	ret = engine->init_hw(engine);
> +	if (ret)
> +		goto out;
> +
> +out:
> +	reset_finish_engine(engine);
> +	return ret;
> +}
> +
> +struct wedge_me {
> +	struct delayed_work work;
> +	struct drm_i915_private *i915;
> +	const char *name;
> +};
> +
> +static void wedge_me(struct work_struct *work)
> +{
> +	struct wedge_me *w = container_of(work, typeof(*w), work.work);
> +
> +	dev_err(w->i915->drm.dev,
> +		"%s timed out, cancelling all in-flight rendering.\n",
> +		w->name);
> +	i915_gem_set_wedged(w->i915);
> +}
> +
> +static void __init_wedge(struct wedge_me *w,
> +			 struct drm_i915_private *i915,
> +			 long timeout,
> +			 const char *name)
> +{
> +	w->i915 = i915;
> +	w->name = name;
> +
> +	INIT_DELAYED_WORK_ONSTACK(&w->work, wedge_me);
> +	schedule_delayed_work(&w->work, timeout);
> +}
> +
> +static void __fini_wedge(struct wedge_me *w)
> +{
> +	cancel_delayed_work_sync(&w->work);
> +	destroy_delayed_work_on_stack(&w->work);
> +	w->i915 = NULL;
> +}
> +
> +#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
> +	for (__init_wedge((W), (DEV), (TIMEOUT), __func__);		\
> +	     (W)->i915;							\
> +	     __fini_wedge((W)))
> +
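The for-loop form above is just a way to give the watchdog block scope: the
body runs exactly once, because __fini_wedge() runs as the loop "increment"
and clears w->i915, so the condition fails on its second evaluation. Roughly
(illustrative expansion only, not part of the patch):

static void sketch_guarded(struct drm_i915_private *i915)
{
	struct wedge_me w;

	for (__init_wedge(&w, i915, 5 * HZ, __func__);	/* arm the watchdog */
	     w.i915;					/* true exactly once */
	     __fini_wedge(&w)) {			/* disarm, clear w.i915 */
		/* guarded section: if it takes longer than 5s, wedge_me()
		 * fires from the delayed work and wedges the GPU */
	}
}
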
> +static void i915_reset_device(struct drm_i915_private *i915,
> +			      u32 engine_mask,
> +			      const char *reason)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	struct kobject *kobj = &i915->drm.primary->kdev->kobj;
> +	char *error_event[] = { I915_ERROR_UEVENT "=1", NULL };
> +	char *reset_event[] = { I915_RESET_UEVENT "=1", NULL };
> +	char *reset_done_event[] = { I915_ERROR_UEVENT "=0", NULL };
> +	struct wedge_me w;
> +
> +	kobject_uevent_env(kobj, KOBJ_CHANGE, error_event);
> +
> +	DRM_DEBUG_DRIVER("resetting chip\n");
> +	kobject_uevent_env(kobj, KOBJ_CHANGE, reset_event);
> +
> +	/* Use a watchdog to ensure that our reset completes */
> +	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
> +		intel_prepare_reset(i915);
> +
> +		error->reason = reason;
> +		error->stalled_mask = engine_mask;
> +
> +		/* Signal that locked waiters should reset the GPU */
> +		smp_mb__before_atomic();
> +		set_bit(I915_RESET_HANDOFF, &error->flags);
> +		wake_up_all(&error->wait_queue);
> +
> +		/*
> +		 * Wait for anyone holding the lock to wakeup, without
> +		 * blocking indefinitely on struct_mutex.
> +		 */
> +		do {
> +			if (mutex_trylock(&i915->drm.struct_mutex)) {
> +				i915_reset(i915, engine_mask, reason);
> +				mutex_unlock(&i915->drm.struct_mutex);
> +			}
> +		} while (wait_on_bit_timeout(&error->flags,
> +					     I915_RESET_HANDOFF,
> +					     TASK_UNINTERRUPTIBLE,
> +					     1));
> +
> +		error->stalled_mask = 0;
> +		error->reason = NULL;
> +
> +		intel_finish_reset(i915);
> +	}
> +
> +	if (!test_bit(I915_WEDGED, &error->flags))
> +		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
> +}
> +
> +static void i915_clear_error_registers(struct drm_i915_private *dev_priv)
> +{
> +	u32 eir;
> +
> +	if (!IS_GEN2(dev_priv))
> +		I915_WRITE(PGTBL_ER, I915_READ(PGTBL_ER));
> +
> +	if (INTEL_GEN(dev_priv) < 4)
> +		I915_WRITE(IPEIR, I915_READ(IPEIR));
> +	else
> +		I915_WRITE(IPEIR_I965, I915_READ(IPEIR_I965));
> +
> +	I915_WRITE(EIR, I915_READ(EIR));
> +	eir = I915_READ(EIR);
> +	if (eir) {
> +		/*
> +		 * some errors might have become stuck,
> +		 * mask them.
> +		 */
> +		DRM_DEBUG_DRIVER("EIR stuck: 0x%08x, masking\n", eir);
> +		I915_WRITE(EMR, I915_READ(EMR) | eir);
> +		I915_WRITE(IIR, I915_MASTER_ERROR_INTERRUPT);
> +	}
> +}
> +
> +/**
> + * i915_handle_error - handle a gpu error
> + * @i915: i915 device private
> + * @engine_mask: mask representing engines that are hung
> + * @flags: control flags
> + * @fmt: Error message format string
> + *
> + * Do some basic checking of register state at error time and
> + * dump it to the syslog.  Also call i915_capture_error_state() to make
> + * sure we get a record and make it available in debugfs.  Fire a uevent
> + * so userspace knows something bad happened (should trigger collection
> + * of a ring dump etc.).
> + */
> +void i915_handle_error(struct drm_i915_private *i915,
> +		       u32 engine_mask,
> +		       unsigned long flags,
> +		       const char *fmt, ...)
> +{
> +	struct intel_engine_cs *engine;
> +	unsigned int tmp;
> +	char error_msg[80];
> +	char *msg = NULL;
> +
> +	if (fmt) {
> +		va_list args;
> +
> +		va_start(args, fmt);
> +		vscnprintf(error_msg, sizeof(error_msg), fmt, args);
> +		va_end(args);
> +
> +		msg = error_msg;
> +	}
> +
> +	/*
> +	 * In most cases it's guaranteed that we get here with an RPM
> +	 * reference held, for example because there is a pending GPU
> +	 * request that won't finish until the reset is done. This
> +	 * isn't the case at least when we get here by doing a
> +	 * simulated reset via debugfs, so get an RPM reference.
> +	 */
> +	intel_runtime_pm_get(i915);
> +
> +	engine_mask &= INTEL_INFO(i915)->ring_mask;
> +
> +	if (flags & I915_ERROR_CAPTURE) {
> +		i915_capture_error_state(i915, engine_mask, msg);
> +		i915_clear_error_registers(i915);
> +	}
> +
> +	/*
> +	 * Try engine reset when available. We fall back to full reset if
> +	 * single reset fails.
> +	 */
> +	if (intel_has_reset_engine(i915)) {
> +		for_each_engine_masked(engine, i915, engine_mask, tmp) {
> +			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
> +			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> +					     &i915->gpu_error.flags))
> +				continue;
> +
> +			if (i915_reset_engine(engine, msg) == 0)
> +				engine_mask &= ~intel_engine_flag(engine);
> +
> +			clear_bit(I915_RESET_ENGINE + engine->id,
> +				  &i915->gpu_error.flags);
> +			wake_up_bit(&i915->gpu_error.flags,
> +				    I915_RESET_ENGINE + engine->id);
> +		}
> +	}
> +
> +	if (!engine_mask)
> +		goto out;
> +
> +	/* Full reset needs the mutex, stop any other user trying to do so. */
> +	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
> +		wait_event(i915->gpu_error.reset_queue,
> +			   !test_bit(I915_RESET_BACKOFF,
> +				     &i915->gpu_error.flags));
> +		goto out;
> +	}
> +
> +	/* Prevent any other reset-engine attempt. */
> +	for_each_engine(engine, i915, tmp) {
> +		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> +					&i915->gpu_error.flags))
> +			wait_on_bit(&i915->gpu_error.flags,
> +				    I915_RESET_ENGINE + engine->id,
> +				    TASK_UNINTERRUPTIBLE);
> +	}
> +
> +	i915_reset_device(i915, engine_mask, msg);
> +
> +	for_each_engine(engine, i915, tmp) {
> +		clear_bit(I915_RESET_ENGINE + engine->id,
> +			  &i915->gpu_error.flags);
> +	}
> +
> +	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> +	wake_up_all(&i915->gpu_error.reset_queue);
> +
> +out:
> +	intel_runtime_pm_put(i915);
> +}
> diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
> new file mode 100644
> index 000000000000..09422c4772dd
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_reset.h
> @@ -0,0 +1,37 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2008-2018 Intel Corporation
> + */
> +
> +#ifndef I915_RESET_H
> +#define I915_RESET_H
> +
> +#include <linux/compiler.h>
> +#include <linux/types.h>
> +
> +struct drm_i915_private;
> +struct intel_engine_cs;
> +struct intel_guc;
> +
> +__printf(4, 5)
> +void i915_handle_error(struct drm_i915_private *i915,
> +		       u32 engine_mask,
> +		       unsigned long flags,
> +		       const char *fmt, ...);
> +#define I915_ERROR_CAPTURE BIT(0)
> +
> +void i915_reset(struct drm_i915_private *i915,
> +		unsigned int stalled_mask,
> +		const char *reason);
> +int i915_reset_engine(struct intel_engine_cs *engine,
> +		      const char *reason);
> +
> +bool intel_has_gpu_reset(struct drm_i915_private *i915);
> +bool intel_has_reset_engine(struct drm_i915_private *i915);
> +
> +int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);
> +
> +int intel_reset_guc(struct drm_i915_private *i915);
> +
> +#endif /* I915_RESET_H */
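
For reference, the usual entry point into the new file from the rest of the
driver is i915_handle_error(). A hedged sketch of how a hang detector might
report a stuck engine (the function name and message text are invented;
ENGINE_MASK() and I915_ERROR_CAPTURE come from the existing code and this
header):

#include "i915_reset.h"

static void sketch_declare_hang(struct intel_engine_cs *engine)
{
	i915_handle_error(engine->i915, ENGINE_MASK(engine->id),
			  I915_ERROR_CAPTURE,
			  "%s stuck in request", engine->name);
}
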
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 7998e70a3174..8a07de5ac740 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -33,13 +33,7 @@
>  #include <linux/vgaarb.h>
>  #include <drm/drm_edid.h>
>  #include <drm/drmP.h>
> -#include "intel_drv.h"
> -#include "intel_frontbuffer.h"
>  #include <drm/i915_drm.h>
> -#include "i915_drv.h"
> -#include "i915_gem_clflush.h"
> -#include "intel_dsi.h"
> -#include "i915_trace.h"
>  #include <drm/drm_atomic.h>
>  #include <drm/drm_atomic_helper.h>
>  #include <drm/drm_dp_helper.h>
> @@ -49,6 +43,15 @@
>  #include <linux/dma_remapping.h>
>  #include <linux/reservation.h>
>  
> +#include "intel_drv.h"
> +#include "intel_dsi.h"
> +#include "intel_frontbuffer.h"
> +
> +#include "i915_drv.h"
> +#include "i915_gem_clflush.h"
> +#include "i915_reset.h"
> +#include "i915_trace.h"
> +
>  /* Primary plane formats for gen <= 3 */
>  static const uint32_t i8xx_primary_formats[] = {
>  	DRM_FORMAT_C8,
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 4121928a495e..df1a384c2f92 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -189,4 +189,7 @@ static inline void intel_guc_disable_msg(struct intel_guc *guc, u32 mask)
>  	spin_unlock_irq(&guc->irq_lock);
>  }
>  
> +int intel_guc_reset_engine(struct intel_guc *guc,
> +			   struct intel_engine_cs *engine);
> +
>  #endif
> diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
> index 2fc7a0dd0df9..5141df342884 100644
> --- a/drivers/gpu/drm/i915/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/intel_hangcheck.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static bool
>  ipehr_is_semaphore_wait(struct intel_engine_cs *engine, u32 ipehr)
> diff --git a/drivers/gpu/drm/i915/intel_uc.c b/drivers/gpu/drm/i915/intel_uc.c
> index 7c95697e1a35..88352ff7164a 100644
> --- a/drivers/gpu/drm/i915/intel_uc.c
> +++ b/drivers/gpu/drm/i915/intel_uc.c
> @@ -26,6 +26,7 @@
>  #include "intel_guc_submission.h"
>  #include "intel_guc.h"
>  #include "i915_drv.h"
> +#include "i915_reset.h"
>  
>  static void guc_free_load_err_log(struct intel_guc *guc);
>  
> diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c
> index b892ca8396e8..1abd342e9cce 100644
> --- a/drivers/gpu/drm/i915/intel_uncore.c
> +++ b/drivers/gpu/drm/i915/intel_uncore.c
> @@ -1698,258 +1698,6 @@ int i915_reg_read_ioctl(struct drm_device *dev,
>  	return ret;
>  }
>  
> -static void gen3_stop_engine(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	const u32 base = engine->mmio_base;
> -
> -	if (intel_engine_stop_cs(engine))
> -		DRM_DEBUG_DRIVER("%s: timed out on STOP_RING\n", engine->name);
> -
> -	I915_WRITE_FW(RING_HEAD(base), I915_READ_FW(RING_TAIL(base)));
> -	POSTING_READ_FW(RING_HEAD(base)); /* paranoia */
> -
> -	I915_WRITE_FW(RING_HEAD(base), 0);
> -	I915_WRITE_FW(RING_TAIL(base), 0);
> -	POSTING_READ_FW(RING_TAIL(base));
> -
> -	/* The ring must be empty before it is disabled */
> -	I915_WRITE_FW(RING_CTL(base), 0);
> -
> -	/* Check acts as a post */
> -	if (I915_READ_FW(RING_HEAD(base)) != 0)
> -		DRM_DEBUG_DRIVER("%s: ring head not parked\n",
> -				 engine->name);
> -}
> -
> -static void i915_stop_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	enum intel_engine_id id;
> -
> -	if (INTEL_GEN(dev_priv) < 3)
> -		return;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, id)
> -		gen3_stop_engine(engine);
> -}
> -
> -static bool i915_in_reset(struct pci_dev *pdev)
> -{
> -	u8 gdrst;
> -
> -	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> -	return gdrst & GRDOM_RESET_STATUS;
> -}
> -
> -static int i915_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -	int err;
> -
> -	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
> -	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	usleep_range(50, 200);
> -	err = wait_for(i915_in_reset(pdev), 500);
> -
> -	/* Clear the reset request. */
> -	pci_write_config_byte(pdev, I915_GDRST, 0);
> -	usleep_range(50, 200);
> -	if (!err)
> -		err = wait_for(!i915_in_reset(pdev), 500);
> -
> -	return err;
> -}
> -
> -static bool g4x_reset_complete(struct pci_dev *pdev)
> -{
> -	u8 gdrst;
> -
> -	pci_read_config_byte(pdev, I915_GDRST, &gdrst);
> -	return (gdrst & GRDOM_RESET_ENABLE) == 0;
> -}
> -
> -static int g33_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -
> -	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	return wait_for(g4x_reset_complete(pdev), 500);
> -}
> -
> -static int g4x_do_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
> -{
> -	struct pci_dev *pdev = dev_priv->drm.pdev;
> -	int ret;
> -
> -	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> -
> -	pci_write_config_byte(pdev, I915_GDRST,
> -			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> -		goto out;
> -	}
> -
> -	pci_write_config_byte(pdev, I915_GDRST,
> -			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> -		goto out;
> -	}
> -
> -out:
> -	pci_write_config_byte(pdev, I915_GDRST, 0);
> -
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> -
> -	return ret;
> -}
> -
> -static int ironlake_do_reset(struct drm_i915_private *dev_priv,
> -			     unsigned engine_mask)
> -{
> -	int ret;
> -
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
> -		goto out;
> -	}
> -
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> -	if (ret) {
> -		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
> -		goto out;
> -	}
> -
> -out:
> -	I915_WRITE(ILK_GDSR, 0);
> -	POSTING_READ(ILK_GDSR);
> -	return ret;
> -}
> -
> -/* Reset the hardware domains (GENX_GRDOM_*) specified by mask */
> -static int gen6_hw_domain_reset(struct drm_i915_private *dev_priv,
> -				u32 hw_domain_mask)
> -{
> -	int err;
> -
> -	/* GEN6_GDRST is not in the gt power well, no need to check
> -	 * for fifo space for the write or forcewake the chip for
> -	 * the read
> -	 */
> -	__raw_i915_write32(dev_priv, GEN6_GDRST, hw_domain_mask);
> -
> -	/* Wait for the device to ack the reset requests */
> -	err = __intel_wait_for_register_fw(dev_priv,
> -					   GEN6_GDRST, hw_domain_mask, 0,
> -					   500, 0,
> -					   NULL);
> -	if (err)
> -		DRM_DEBUG_DRIVER("Wait for 0x%08x engines reset failed\n",
> -				 hw_domain_mask);
> -
> -	return err;
> -}
> -
> -/**
> - * gen6_reset_engines - reset individual engines
> - * @dev_priv: i915 device
> - * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
> - *
> - * This function will reset the individual engines that are set in engine_mask.
> - * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
> - *
> - * Note: It is responsibility of the caller to handle the difference between
> - * asking full domain reset versus reset for all available individual engines.
> - *
> - * Returns 0 on success, nonzero on error.
> - */
> -static int gen6_reset_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> -		[RCS] = GEN6_GRDOM_RENDER,
> -		[BCS] = GEN6_GRDOM_BLT,
> -		[VCS] = GEN6_GRDOM_MEDIA,
> -		[VCS2] = GEN8_GRDOM_MEDIA2,
> -		[VECS] = GEN6_GRDOM_VECS,
> -	};
> -	u32 hw_mask;
> -
> -	if (engine_mask == ALL_ENGINES) {
> -		hw_mask = GEN6_GRDOM_FULL;
> -	} else {
> -		unsigned int tmp;
> -
> -		hw_mask = 0;
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -			hw_mask |= hw_engine_mask[engine->id];
> -	}
> -
> -	return gen6_hw_domain_reset(dev_priv, hw_mask);
> -}
> -
> -/**
> - * gen11_reset_engines - reset individual engines
> - * @dev_priv: i915 device
> - * @engine_mask: mask of intel_ring_flag() engines or ALL_ENGINES for full reset
> - *
> - * This function will reset the individual engines that are set in engine_mask.
> - * If you provide ALL_ENGINES as mask, full global domain reset will be issued.
> - *
> - * Note: It is responsibility of the caller to handle the difference between
> - * asking full domain reset versus reset for all available individual engines.
> - *
> - * Returns 0 on success, nonzero on error.
> - */
> -static int gen11_reset_engines(struct drm_i915_private *dev_priv,
> -			       unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	const u32 hw_engine_mask[I915_NUM_ENGINES] = {
> -		[RCS] = GEN11_GRDOM_RENDER,
> -		[BCS] = GEN11_GRDOM_BLT,
> -		[VCS] = GEN11_GRDOM_MEDIA,
> -		[VCS2] = GEN11_GRDOM_MEDIA2,
> -		[VCS3] = GEN11_GRDOM_MEDIA3,
> -		[VCS4] = GEN11_GRDOM_MEDIA4,
> -		[VECS] = GEN11_GRDOM_VECS,
> -		[VECS2] = GEN11_GRDOM_VECS2,
> -	};
> -	u32 hw_mask;
> -
> -	BUILD_BUG_ON(VECS2 + 1 != I915_NUM_ENGINES);
> -
> -	if (engine_mask == ALL_ENGINES) {
> -		hw_mask = GEN11_GRDOM_FULL;
> -	} else {
> -		unsigned int tmp;
> -
> -		hw_mask = 0;
> -		for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -			hw_mask |= hw_engine_mask[engine->id];
> -	}
> -
> -	return gen6_hw_domain_reset(dev_priv, hw_mask);
> -}
> -
>  /**
>   * __intel_wait_for_register_fw - wait until register matches expected state
>   * @dev_priv: the i915 device
> @@ -2060,169 +1808,6 @@ int __intel_wait_for_register(struct drm_i915_private *dev_priv,
>  	return ret;
>  }
>  
> -static int gen8_reset_engine_start(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -	int ret;
> -
> -	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> -		      _MASKED_BIT_ENABLE(RESET_CTL_REQUEST_RESET));
> -
> -	ret = __intel_wait_for_register_fw(dev_priv,
> -					   RING_RESET_CTL(engine->mmio_base),
> -					   RESET_CTL_READY_TO_RESET,
> -					   RESET_CTL_READY_TO_RESET,
> -					   700, 0,
> -					   NULL);
> -	if (ret)
> -		DRM_ERROR("%s: reset request timeout\n", engine->name);
> -
> -	return ret;
> -}
> -
> -static void gen8_reset_engine_cancel(struct intel_engine_cs *engine)
> -{
> -	struct drm_i915_private *dev_priv = engine->i915;
> -
> -	I915_WRITE_FW(RING_RESET_CTL(engine->mmio_base),
> -		      _MASKED_BIT_DISABLE(RESET_CTL_REQUEST_RESET));
> -}
> -
> -static int gen8_reset_engines(struct drm_i915_private *dev_priv,
> -			      unsigned engine_mask)
> -{
> -	struct intel_engine_cs *engine;
> -	unsigned int tmp;
> -	int ret;
> -
> -	for_each_engine_masked(engine, dev_priv, engine_mask, tmp) {
> -		if (gen8_reset_engine_start(engine)) {
> -			ret = -EIO;
> -			goto not_ready;
> -		}
> -	}
> -
> -	if (INTEL_GEN(dev_priv) >= 11)
> -		ret = gen11_reset_engines(dev_priv, engine_mask);
> -	else
> -		ret = gen6_reset_engines(dev_priv, engine_mask);
> -
> -not_ready:
> -	for_each_engine_masked(engine, dev_priv, engine_mask, tmp)
> -		gen8_reset_engine_cancel(engine);
> -
> -	return ret;
> -}
> -
> -typedef int (*reset_func)(struct drm_i915_private *, unsigned engine_mask);
> -
> -static reset_func intel_get_gpu_reset(struct drm_i915_private *dev_priv)
> -{
> -	if (!i915_modparams.reset)
> -		return NULL;
> -
> -	if (INTEL_GEN(dev_priv) >= 8)
> -		return gen8_reset_engines;
> -	else if (INTEL_GEN(dev_priv) >= 6)
> -		return gen6_reset_engines;
> -	else if (IS_GEN5(dev_priv))
> -		return ironlake_do_reset;
> -	else if (IS_G4X(dev_priv))
> -		return g4x_do_reset;
> -	else if (IS_G33(dev_priv) || IS_PINEVIEW(dev_priv))
> -		return g33_do_reset;
> -	else if (INTEL_GEN(dev_priv) >= 3)
> -		return i915_do_reset;
> -	else
> -		return NULL;
> -}
> -
> -int intel_gpu_reset(struct drm_i915_private *dev_priv, unsigned engine_mask)
> -{
> -	reset_func reset = intel_get_gpu_reset(dev_priv);
> -	int retry;
> -	int ret;
> -
> -	/*
> -	 * We want to perform per-engine reset from atomic context (e.g.
> -	 * softirq), which imposes the constraint that we cannot sleep.
> -	 * However, experience suggests that spending a bit of time waiting
> -	 * for a reset helps in various cases, so for a full-device reset
> -	 * we apply the opposite rule and wait if we want to. As we should
> -	 * always follow up a failed per-engine reset with a full device reset,
> -	 * being a little faster, stricter and more error prone for the
> -	 * atomic case seems an acceptable compromise.
> -	 *
> -	 * Unfortunately this leads to a bimodal routine, when the goal was
> -	 * to have a single reset function that worked for resetting any
> -	 * number of engines simultaneously.
> -	 */
> -	might_sleep_if(engine_mask == ALL_ENGINES);
> -
> -	/*
> -	 * If the power well sleeps during the reset, the reset
> -	 * request may be dropped and never completes (causing -EIO).
> -	 */
> -	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> -	for (retry = 0; retry < 3; retry++) {
> -
> -		/*
> -		 * We stop engines, otherwise we might get failed reset and a
> -		 * dead gpu (on elk). Also as modern gpu as kbl can suffer
> -		 * from system hang if batchbuffer is progressing when
> -		 * the reset is issued, regardless of READY_TO_RESET ack.
> -		 * Thus assume it is best to stop engines on all gens
> -		 * where we have a gpu reset.
> -		 *
> -		 * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
> -		 *
> -		 * WaMediaResetMainRingCleanup:ctg,elk (presumably)
> -		 *
> -		 * FIXME: Wa for more modern gens needs to be validated
> -		 */
> -		i915_stop_engines(dev_priv, engine_mask);
> -
> -		ret = -ENODEV;
> -		if (reset) {
> -			GEM_TRACE("engine_mask=%x\n", engine_mask);
> -			ret = reset(dev_priv, engine_mask);
> -		}
> -		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
> -			break;
> -
> -		cond_resched();
> -	}
> -	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> -
> -	return ret;
> -}
> -
> -bool intel_has_gpu_reset(struct drm_i915_private *dev_priv)
> -{
> -	return intel_get_gpu_reset(dev_priv) != NULL;
> -}
> -
> -bool intel_has_reset_engine(struct drm_i915_private *dev_priv)
> -{
> -	return (dev_priv->info.has_reset_engine &&
> -		i915_modparams.reset >= 2);
> -}
> -
> -int intel_reset_guc(struct drm_i915_private *dev_priv)
> -{
> -	u32 guc_domain = INTEL_GEN(dev_priv) >= 11 ? GEN11_GRDOM_GUC :
> -						     GEN9_GRDOM_GUC;
> -	int ret;
> -
> -	GEM_BUG_ON(!HAS_GUC(dev_priv));
> -
> -	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
> -	ret = gen6_hw_domain_reset(dev_priv, guc_domain);
> -	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> -
> -	return ret;
> -}
> -
>  bool intel_uncore_unclaimed_mmio(struct drm_i915_private *dev_priv)
>  {
>  	return check_for_unclaimed_mmio(dev_priv);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> index fafdec3fe83e..7f842dbbea1f 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
> @@ -5,6 +5,7 @@
>   */
>  
>  #include "../i915_selftest.h"
> +#include "../i915_reset.h"
>  
>  #include "mock_context.h"
>  
> -- 
> 2.18.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c
  2018-07-11  9:17   ` Daniel Vetter
@ 2018-07-11  9:28     ` Chris Wilson
  0 siblings, 0 replies; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  9:28 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

Quoting Daniel Vetter (2018-07-11 10:17:09)
> On Wed, Jul 11, 2018 at 08:36:07AM +0100, Chris Wilson wrote:
> > Currently the code to reset the GPU and our state is spread widely
> > across a few files. Pull the logic together into a common file.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Hm when reading the fence code I'd suggested more to move the reset/init
> code for a given topic (like fence_reg) into one place. Imo that makes
> reviewing changes to a specific thing easier because it's all in one file.
> 
> Ofc it makes it harder to review large-scale flows like suspend/resume and
> gpu reset, but I think the only thing that can save us there is lots and
> lots of runtime consistency checks for each component (lockdep,
> GEM_BUG_ON). Reviewing those is doable, and for the big picture
> CI+regrets.
> 
> tldr; not sure this is the best organization we can do.

I am confident that pulling the code flow that was split across i915_irq.c
(of all of them, this was the silliest), i915_drv.c, intel_uncore.c and
then i915_gem.c into a single location is an improvement for
understanding that flow and the subsequent changes.

Whether it stands the test of time, we will see, but the immediate
question is simply whether it is an improvement that opens up further
incremental improvements, or if it is a complete dead end, a maintenance
trap.
-Chris

* Re: [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
  2018-07-11  8:09   ` Daniel Vetter
@ 2018-07-11  9:33   ` Daniel Vetter
  2018-07-11  9:36     ` Daniel Vetter
  1 sibling, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  9:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 08:36:02AM +0100, Chris Wilson wrote:
> Add a mutex into struct i915_address_space to be used while operating on
> the vma and their lists for a particular vm. As this may be called from
> the shrinker, we taint the mutex with fs_reclaim so that from the start
> lockdep warns us if we are caught holding the mutex across an
> allocation. (With such small steps we will eventually rid ourselves of
> struct_mutex recursion!)
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Not sure it exists in a branch of yours already, but here's my thoughts on
extending this to the address_space lrus and the shrinker callback (which
I think would be the next step with good pay-off):

1. make sure pin_count is protected by reservation_obj.
2. grab the vm.mutex when walking LRUs everywhere. This is going to be
tricky for ggtt because of runtime PM. Between lock-dropping, carefully
avoiding rpm when cleaning up objects and just grabbing an rpm wakeref
when walking the ggtt vm this should be possible to work around (since for
the fences we clearly need to be able to nest the vm.mutex within rpm or
we're busted).
3. In the shrinker trylock the reservation_obj and treat a failure to get
the lock as if pin_count is elevated. If we can't shrink enough then grab
a temporary reference to the bo using kref_get_unless_zero, drop the
vm.mutex (since that's what gave us the weak ref) and do a blocking
reservation_obj lock.
4. Audit the obj/vma unbind paths and apply reservation_obj or vm.mutex
locking as needed until we can drop the struct_mutex from the shrinker.
Biggest trouble is probably ggtt mmap (but I think ordering of pte
shootdown takes care of that) and drm_mm ("just" needs to be protected by
vm.mutex too I think).
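
For step 3, very roughly what I have in mind (completely untested sketch;
shrink_vm(), i915_vma_shrink(), vm->bound_list and the exact obj->resv
spelling are all made up on the spot, so only the locking dance matters):

static unsigned long shrink_vm(struct i915_address_space *vm,
			       unsigned long target)
{
	struct i915_vma *vma, *next;
	unsigned long freed = 0;

	mutex_lock(&vm->mutex);
	list_for_each_entry_safe(vma, next, &vm->bound_list, vm_link) {
		struct drm_i915_gem_object *obj = vma->obj;

		/* contention on the reservation_obj == treat as pinned */
		if (!reservation_object_trylock(obj->resv))
			continue;

		freed += i915_vma_shrink(vma);
		reservation_object_unlock(obj->resv);

		if (freed >= target)
			break;
	}
	mutex_unlock(&vm->mutex);

	/*
	 * If this pass didn't reach the target, a second pass would take a
	 * temporary reference with kref_get_unless_zero(&obj->base.refcount),
	 * drop vm->mutex (that's what made the weak ref safe) and retry with
	 * a blocking reservation_object_lock().
	 */
	return freed;
}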

Plan is probably the underestimation of the decade, at least :-)
-Daniel
> ---
>  drivers/gpu/drm/i915/i915_drv.h          |  2 +-
>  drivers/gpu/drm/i915/i915_gem_gtt.c      | 10 ++++++++++
>  drivers/gpu/drm/i915/i915_gem_gtt.h      |  2 ++
>  drivers/gpu/drm/i915/i915_gem_shrinker.c | 12 ++++++++++++
>  4 files changed, 25 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index eeb002a47032..01dd29837233 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3304,7 +3304,7 @@ unsigned long i915_gem_shrink(struct drm_i915_private *i915,
>  unsigned long i915_gem_shrink_all(struct drm_i915_private *i915);
>  void i915_gem_shrinker_register(struct drm_i915_private *i915);
>  void i915_gem_shrinker_unregister(struct drm_i915_private *i915);
> -
> +void i915_gem_shrinker_taints_mutex(struct mutex *mutex);
>  
>  /* i915_gem_tiling.c */
>  static inline bool i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index abd81fb9b0b6..d0acef299b9c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -531,6 +531,14 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
>  static void i915_address_space_init(struct i915_address_space *vm,
>  				    struct drm_i915_private *dev_priv)
>  {
> +	/*
> +	 * The vm->mutex must be reclaim safe (for use in the shrinker).
> +	 * Do a dummy acquire now under fs_reclaim so that any allocation
> +	 * attempt holding the lock is immediately reported by lockdep.
> +	 */
> +	mutex_init(&vm->mutex);
> +	i915_gem_shrinker_taints_mutex(&vm->mutex);
> +
>  	GEM_BUG_ON(!vm->total);
>  	drm_mm_init(&vm->mm, 0, vm->total);
>  	vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;
> @@ -551,6 +559,8 @@ static void i915_address_space_fini(struct i915_address_space *vm)
>  	spin_unlock(&vm->free_pages.lock);
>  
>  	drm_mm_takedown(&vm->mm);
> +
> +	mutex_destroy(&vm->mutex);
>  }
>  
>  static int __setup_page_dma(struct i915_address_space *vm,
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index feda45dfd481..14e62651010b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -293,6 +293,8 @@ struct i915_address_space {
>  
>  	bool closed;
>  
> +	struct mutex mutex; /* protects vma and our lists */
> +
>  	struct i915_page_dma scratch_page;
>  	struct i915_page_table *scratch_pt;
>  	struct i915_page_directory *scratch_pd;
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index c61f5b80fee3..ea90d3a0d511 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -23,6 +23,7 @@
>   */
>  
>  #include <linux/oom.h>
> +#include <linux/sched/mm.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
>  #include <linux/swap.h>
> @@ -531,3 +532,14 @@ void i915_gem_shrinker_unregister(struct drm_i915_private *i915)
>  	WARN_ON(unregister_oom_notifier(&i915->mm.oom_notifier));
>  	unregister_shrinker(&i915->mm.shrinker);
>  }
> +
> +void i915_gem_shrinker_taints_mutex(struct mutex *mutex)
> +{
> +	if (!IS_ENABLED(CONFIG_LOCKDEP))
> +		return;
> +
> +	fs_reclaim_acquire(GFP_KERNEL);
> +	mutex_lock(mutex);
> +	mutex_unlock(mutex);
> +	fs_reclaim_release(GFP_KERNEL);
> +}
> -- 
> 2.18.0
> 

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  9:33   ` Daniel Vetter
@ 2018-07-11  9:36     ` Daniel Vetter
  2018-07-11  9:49       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Daniel Vetter @ 2018-07-11  9:36 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 11:33:26AM +0200, Daniel Vetter wrote:
> On Wed, Jul 11, 2018 at 08:36:02AM +0100, Chris Wilson wrote:
> > Add a mutex into struct i915_address_space to be used while operating on
> > the vma and their lists for a particular vm. As this may be called from
> > the shrinker, we taint the mutex with fs_reclaim so that from the start
> > lockdep warns us if we are caught holding the mutex across an
> > allocation. (With such small steps we will eventually rid ourselves of
> > struct_mutex recursion!)
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Not sure it exists in a branch of yours already, but here's my thoughts on
> extending this to the address_space lrus and the shrinker callback (which
> I think would be the next step with good pay-off):
> 
> 1. make sure pin_count is protected by reservation_obj.
> 2. grab the vm.mutex when walking LRUs everywhere. This is going to be
> tricky for ggtt because of runtime PM. Between lock-dropping, carefully
> avoiding rpm when cleaning up objects and just grabbing an rpm wakeref
> when walking the ggtt vm this should be possible to work around (since for
> the fences we clearly need to be able to nest the vm.mutex within rpm or
> we're busted).
> 3. In the shrinker trylock the reservation_obj and treat a failure to get
> the lock as if pin_count is elevated. If we can't shrink enough then grab
> a temporary reference to the bo using kref_get_unless_zero, drop the
> vm.mutex (since that's what gave us the weak ref) and do a blocking
> reservation_obj lock.

Ok this doesn't work, because reservation_obj needs to allow allocations.
But compared to our current lock stealing trickery the above scheme
reduces possibilities to shrink, or at least rate-limit command submission
somewhat. Not sure how to best tackle that.

Either way a fs_reclaim_*() trick to annotate reservation_obj would be
good ...
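
Something like the i915_gem_shrinker_taints_mutex() trick from this patch,
just aimed at the reservation_obj ww_mutex (rough sketch, helper name made
up):

static void taint_reservation_object(struct reservation_object *resv)
{
	if (!IS_ENABLED(CONFIG_LOCKDEP))
		return;

	fs_reclaim_acquire(GFP_KERNEL);
	ww_mutex_lock(&resv->lock, NULL);
	ww_mutex_unlock(&resv->lock);
	fs_reclaim_release(GFP_KERNEL);
}

Called once at init, like the vm->mutex taint, so lockdep complains the
first time anyone allocates while holding the reservation_obj.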
-Daniel


> 4. Audit the obj/vma unbind paths and apply reservation_obj or vm.mutex
> locking as needed until we can drop the struct_mutex from the shrinker.
> Biggest trouble is probably ggtt mmap (but I think ordering of pte
> shootdown takes care of that) and drm_mm ("just" needs to be protected by
> vm.mutex too I think).
> 
> Plan is probably the underestimation of the decade, at least :-)
> -Daniel
> > ---
> >  drivers/gpu/drm/i915/i915_drv.h          |  2 +-
> >  drivers/gpu/drm/i915/i915_gem_gtt.c      | 10 ++++++++++
> >  drivers/gpu/drm/i915/i915_gem_gtt.h      |  2 ++
> >  drivers/gpu/drm/i915/i915_gem_shrinker.c | 12 ++++++++++++
> >  4 files changed, 25 insertions(+), 1 deletion(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> > index eeb002a47032..01dd29837233 100644
> > --- a/drivers/gpu/drm/i915/i915_drv.h
> > +++ b/drivers/gpu/drm/i915/i915_drv.h
> > @@ -3304,7 +3304,7 @@ unsigned long i915_gem_shrink(struct drm_i915_private *i915,
> >  unsigned long i915_gem_shrink_all(struct drm_i915_private *i915);
> >  void i915_gem_shrinker_register(struct drm_i915_private *i915);
> >  void i915_gem_shrinker_unregister(struct drm_i915_private *i915);
> > -
> > +void i915_gem_shrinker_taints_mutex(struct mutex *mutex);
> >  
> >  /* i915_gem_tiling.c */
> >  static inline bool i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > index abd81fb9b0b6..d0acef299b9c 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> > @@ -531,6 +531,14 @@ static void vm_free_page(struct i915_address_space *vm, struct page *page)
> >  static void i915_address_space_init(struct i915_address_space *vm,
> >  				    struct drm_i915_private *dev_priv)
> >  {
> > +	/*
> > +	 * The vm->mutex must be reclaim safe (for use in the shrinker).
> > +	 * Do a dummy acquire now under fs_reclaim so that any allocation
> > +	 * attempt holding the lock is immediately reported by lockdep.
> > +	 */
> > +	mutex_init(&vm->mutex);
> > +	i915_gem_shrinker_taints_mutex(&vm->mutex);
> > +
> >  	GEM_BUG_ON(!vm->total);
> >  	drm_mm_init(&vm->mm, 0, vm->total);
> >  	vm->mm.head_node.color = I915_COLOR_UNEVICTABLE;
> > @@ -551,6 +559,8 @@ static void i915_address_space_fini(struct i915_address_space *vm)
> >  	spin_unlock(&vm->free_pages.lock);
> >  
> >  	drm_mm_takedown(&vm->mm);
> > +
> > +	mutex_destroy(&vm->mutex);
> >  }
> >  
> >  static int __setup_page_dma(struct i915_address_space *vm,
> > diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > index feda45dfd481..14e62651010b 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> > +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> > @@ -293,6 +293,8 @@ struct i915_address_space {
> >  
> >  	bool closed;
> >  
> > +	struct mutex mutex; /* protects vma and our lists */
> > +
> >  	struct i915_page_dma scratch_page;
> >  	struct i915_page_table *scratch_pt;
> >  	struct i915_page_directory *scratch_pd;
> > diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > index c61f5b80fee3..ea90d3a0d511 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> > @@ -23,6 +23,7 @@
> >   */
> >  
> >  #include <linux/oom.h>
> > +#include <linux/sched/mm.h>
> >  #include <linux/shmem_fs.h>
> >  #include <linux/slab.h>
> >  #include <linux/swap.h>
> > @@ -531,3 +532,14 @@ void i915_gem_shrinker_unregister(struct drm_i915_private *i915)
> >  	WARN_ON(unregister_oom_notifier(&i915->mm.oom_notifier));
> >  	unregister_shrinker(&i915->mm.shrinker);
> >  }
> > +
> > +void i915_gem_shrinker_taints_mutex(struct mutex *mutex)
> > +{
> > +	if (!IS_ENABLED(CONFIG_LOCKDEP))
> > +		return;
> > +
> > +	fs_reclaim_acquire(GFP_KERNEL);
> > +	mutex_lock(mutex);
> > +	mutex_unlock(mutex);
> > +	fs_reclaim_release(GFP_KERNEL);
> > +}
> > -- 
> > 2.18.0
> > 
> 
> -- 
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  9:36     ` Daniel Vetter
@ 2018-07-11  9:49       ` Chris Wilson
  2018-07-12  7:01         ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11  9:49 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

Quoting Daniel Vetter (2018-07-11 10:36:36)
> On Wed, Jul 11, 2018 at 11:33:26AM +0200, Daniel Vetter wrote:
> > On Wed, Jul 11, 2018 at 08:36:02AM +0100, Chris Wilson wrote:
> > > Add a mutex into struct i915_address_space to be used while operating on
> > > the vma and their lists for a particular vm. As this may be called from
> > > the shrinker, we taint the mutex with fs_reclaim so that from the start
> > > lockdep warns us if we are caught holding the mutex across an
> > > allocation. (With such small steps we will eventually rid ourselves of
> > > struct_mutex recursion!)
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > Not sure it exists in a branch of yours already, but here's my thoughts on
> > extending this to the address_space lrus and the shrinker callback (which
> > I think would be the next step with good pay-off):
> > 
> > 1. make sure pin_count is protected by reservation_obj.

It's vma->pin_count, so I guarded it with vm->mutex (along with the
drm_mm and vm->*list, with some vigorous pruning of lists to reduce the
locking surface). There's also the obj->vma_list and trees that are
moved to a obj->vma_lock.
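
In struct terms, roughly (simplified from my branch, so treat the comments
and placement as hypothetical):

struct i915_address_space {
	struct mutex mutex;	/* vm->*_list, drm_mm, vma->pin_count */
	/* ... */
};

struct drm_i915_gem_object {
	struct mutex vma_lock;	/* obj->vma_list and the vma rbtree */
	/* ... */
};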

> > 2. grab the vm.mutex when walking LRUs everywhere. This is going to be
> > tricky for ggtt because of runtime PM. Between lock-dropping, carefully
> > avoiding rpm when cleaning up objects and just grabbing an rpm wakeref
> > when walking the ggtt vm this should be possible to work around (since for
> > the fences we clearly need to be able to nest the vm.mutex within rpm or
> > we're busted).
> > 3. In the shrinker trylock the reservation_obj and treat a failure to get
> > the lock as if pin_count is elevated. If we can't shrink enough then grab
> > a temporary reference to the bo using kref_get_unless_zero, drop the
> > vm.mutex (since that's what gave us the weak ref) and do a blocking
> > reservation_obj lock.
> 
> Ok this doesn't work, because reservation_obj needs to allow allocations.
> But compared to our current lock stealing trickery the above scheme
> reduces possibilities to shrink, or at least rate-limit command submission
> somewhat. Not sure how to best tackle that.

Right, the only way is to avoid using reservation_obj->lock inside
the shrinker, which is quite easy (for the moment at least; I haven't
completed the i915_request eradication of struct_mutex for ordering,
which is likely to require more reservation_obj or similar).

I do have a branch that gets us to no struct_mutex inside intel_display
and to remove struct_mutex from the shrinker within 10 more ugly patches
that pass CI, which is a start. Something that I want to drip feed
slowly so that we can catch the high MTBF regressions that are inevitable.
-Chris

* Re: [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
  2018-07-11  9:08   ` Daniel Vetter
@ 2018-07-11 10:57     ` Chris Wilson
  2018-07-11 11:12       ` Chris Wilson
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11 10:57 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

Quoting Daniel Vetter (2018-07-11 10:08:46)
> On Wed, Jul 11, 2018 at 08:36:04AM +0100, Chris Wilson wrote:
> > Introduce a new mutex to guard all of the vma operations within a vm (as
> > opposed to the BKL struct_mutex) and start by using it to guard the
> > fence operations for a GGTT VMA.
> 
> Commit message is a bit confusing, since you've already introduce this new
> mutex in an earlier patch. Please change to "Switch from dev->struct_mutex
> to ggtt.vm->mutex" or similar ...
> 
> For the reviewers benefit it would also be good to explain how this new
> vm.mutex nests with others stuff here (dev->struct_mutex and rpm come to
> mind, looking from the patch). Probably best here than in docs since it's
> likely going to get outdated.
> 
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c        |  9 ++-
> >  drivers/gpu/drm/i915/i915_gem.c            | 11 +++-
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  5 +-
> >  drivers/gpu/drm/i915/i915_gem_fence_reg.c  | 68 +++++++++++++++++-----
> >  drivers/gpu/drm/i915/i915_vma.c            | 12 ++--
> >  drivers/gpu/drm/i915/i915_vma.h            | 23 +++++++-
> >  6 files changed, 96 insertions(+), 32 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index 75ffed6a3f31..e2ba298a5d88 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -80,7 +80,7 @@ static char get_tiling_flag(struct drm_i915_gem_object *obj)
> >  
> >  static char get_global_flag(struct drm_i915_gem_object *obj)
> >  {
> > -     return obj->userfault_count ? 'g' : ' ';
> > +     return READ_ONCE(obj->userfault_count) ? 'g' : ' ';
> 
> The usefault_count here (and below) look like misplaced hunks?
> 
> >  }
> >  
> >  static char get_pin_mapped_flag(struct drm_i915_gem_object *obj)
> > @@ -914,11 +914,10 @@ static int i915_interrupt_info(struct seq_file *m, void *data)
> >  
> >  static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
> >  {
> > -     struct drm_i915_private *i915 = node_to_i915(m->private);
> > -     const struct i915_ggtt *ggtt = &i915->ggtt;
> > +     struct i915_ggtt *ggtt = &node_to_i915(m->private)->ggtt;
> >       int i, ret;
> >  
> > -     ret = mutex_lock_interruptible(&i915->drm.struct_mutex);
> > +     ret = mutex_lock_interruptible(&ggtt->vm.mutex);
> >       if (ret)
> >               return ret;
> >  
> > @@ -935,7 +934,7 @@ static int i915_gem_fence_regs_info(struct seq_file *m, void *data)
> >               seq_putc(m, '\n');
> >       }
> >  
> > -     mutex_unlock(&i915->drm.struct_mutex);
> > +     mutex_unlock(&ggtt->vm.mutex);
> >       return 0;
> >  }
> >  
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index 356c86071ccc..cbcba613b175 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -2193,8 +2193,8 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
> >        * requirement that operations to the GGTT be made holding the RPM
> >        * wakeref.
> >        */
> > -     lockdep_assert_held(&i915->drm.struct_mutex);
> >       intel_runtime_pm_get(i915);
> > +     mutex_lock(&i915->ggtt.vm.mutex);
> >  
> >       if (!obj->userfault_count)
> >               goto out;
> > @@ -2211,6 +2211,7 @@ i915_gem_release_mmap(struct drm_i915_gem_object *obj)
> >       wmb();
> >  
> >  out:
> > +     mutex_unlock(&i915->ggtt.vm.mutex);
> >       intel_runtime_pm_put(i915);
> >  }
> >  
> > @@ -2223,10 +2224,12 @@ void i915_gem_runtime_suspend(struct drm_i915_private *i915)
> >       /*
> >        * Only called during RPM suspend. All users of the userfault_list
> >        * must be holding an RPM wakeref to ensure that this can not
> > -      * run concurrently with themselves (and use the struct_mutex for
> > +      * run concurrently with themselves (and use the ggtt->mutex for
> >        * protection between themselves).
> >        */
> 
> I think the above change isn't correct, at least not yet at this stage:
> All users of the userfault_list still use dev->struct_mutex, not vm.mutex.
> I guess we could move that over to the ggtt.vm.mutex eventually, but this
> patch doesn't do that.

It does, all those misplaced hunks are not so misplaced.
-Chris

* Re: [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
  2018-07-11 10:57     ` Chris Wilson
@ 2018-07-11 11:12       ` Chris Wilson
  2018-07-12  7:12         ` Daniel Vetter
  0 siblings, 1 reply; 26+ messages in thread
From: Chris Wilson @ 2018-07-11 11:12 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

Quoting Chris Wilson (2018-07-11 11:57:38)
> Quoting Daniel Vetter (2018-07-11 10:08:46)
> > I think the above change isn't correct, at least not yet at this stage:
> > All users of the userfault_list still use dev->struct_mutex, not vm.mutex.
> > I guess we could move that over to the ggtt.vm.mutex eventually, but this
> > patch doesn't do that.
> 
> It does, all those misplaced hunks are not so misplaced.

Since we have differing opinions on whether this is or is not
sufficiently guarding GGTT vs rpm, what test are we missing in CI to
conclusively indicate whether or not this is broken? As it stands, CI is
happy, and I don't have many machines where rpm works (since it requires
fw, sound drivers, etc).
-Chris

* Re: [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex
  2018-07-11  9:49       ` Chris Wilson
@ 2018-07-12  7:01         ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2018-07-12  7:01 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 10:49:51AM +0100, Chris Wilson wrote:
> Quoting Daniel Vetter (2018-07-11 10:36:36)
> > On Wed, Jul 11, 2018 at 11:33:26AM +0200, Daniel Vetter wrote:
> > > On Wed, Jul 11, 2018 at 08:36:02AM +0100, Chris Wilson wrote:
> > > > Add a mutex into struct i915_address_space to be used while operating on
> > > > the vma and their lists for a particular vm. As this may be called from
> > > > the shrinker, we taint the mutex with fs_reclaim so that from the start
> > > > lockdep warns us if we are caught holding the mutex across an
> > > > allocation. (With such small steps we will eventually rid ourselves of
> > > > struct_mutex recursion!)
> > > > 
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > Not sure it exists in a branch of yours already, but here's my thoughts on
> > > extending this to the address_space lrus and the shrinker callback (which
> > > I think would be the next step with good pay-off):
> > > 
> > > 1. make sure pin_count is protected by reservation_obj.
> 
> It's vma->pin_count, so I guarded it with vm->mutex (along with the
> drm_mm and vm->*list, with some vigorous pruning of lists to reduce the
> locking surface). There's also the obj->vma_list and trees that are
> moved to a obj->vma_lock.

Hm, Christian König's series to wrap dma_buf_map/unmap in the
reservation_obj is why I thought we'd need that. But just for the unmap of
foreign objects I guess we can always punt that to some worker, or just
not bother if it's contended.

> > > 2. grab the vm.mutex when walking LRUs everywhere. This is going to be
> > > tricky for ggtt because of runtime PM. Between lock-dropping, carefully
> > > avoiding rpm when cleaning up objects and just grabbing an rpm wakeref
> > > when walking the ggtt vm this should be possible to work around (since for
> > > the fences we clearly need to be able to nest the vm.mutex within rpm or
> > > we're busted).
> > > 3. In the shrinker trylock the reservation_obj and treat a failure to get
> > > the lock as if pin_count is elevated. If we can't shrink enough then grab
> > > a temporary reference to the bo using kref_get_unless_zero, drop the
> > > vm.mutex (since that's what gave us the weak ref) and do a blocking
> > > reservation_obj lock.
> > 
> > Ok this doesn't work, because reservation_obj needs to allow allocations.
> > But compared to our current lock stealing trickery the above scheme
> > reduces possibilities to shrink, or at least rate-limit command submission
> > somewhat. Not sure how to best tackle that.
> 
> Right, the only way is to avoid using reservation_obj->lock inside
> the shrinker, which is quite easy (for the moment at least; I haven't
> completed the i915_request eradication of struct_mutex for ordering,
> which is likely to require more reservation_obj or similar).

Yeah I think as long as i915 doesn't need it we can get away with
trylocks for foreign objects (which I think will need it, sooner or
later).

> I do have a branch that gets us to no struct_mutex inside intel_display
> and to remove struct_mutex from the shrinker within 10 more ugly patches
> that pass CI, which is a start. Something that I want to drip feed
> slowly so that we can catch the high MTBF regressions that are inevitable.

Very much agreed on drip feeding :-)
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

* Re: [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex
  2018-07-11 11:12       ` Chris Wilson
@ 2018-07-12  7:12         ` Daniel Vetter
  0 siblings, 0 replies; 26+ messages in thread
From: Daniel Vetter @ 2018-07-12  7:12 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Wed, Jul 11, 2018 at 12:12:39PM +0100, Chris Wilson wrote:
> Quoting Chris Wilson (2018-07-11 11:57:38)
> > Quoting Daniel Vetter (2018-07-11 10:08:46)
> > > I think the above change isn't correct, at least not yet at this stage:
> > > All users of the userfault_list still use dev->struct_mutex, not vm.mutex.
> > > I guess we could move that over to the ggtt.vm.mutex eventually, but this
> > > patch doesn't do that.
> > 
> > It does, all those misplaced hunks are not so misplaced.
> 
> Since we have differing opinions on whether this is or is not
> sufficiently guarding GGTT vs rpm, what test are we missing in CI to
> conclusively indicate whether or not this is broken? As it stands, CI is
> happy, and I don't have many machines where rpm works (since it requires
> fw, sound drivers, etc).

So the "misplaced hunk" comment I typed before I fully read through the
patch and spotted the rework of the userfault list. My understanding is
still that the userfault_list is protected by rpm (see some of the
assert_rpm_wakelock_held right next to them), or maybe a combination of
rpm + dev->struct_mutex.

Afaict it's definitely not protected by the new vm.mutex, even after this
patch. That's why I complained about the comment change, and also why I
didn't see why this patch here needs the userfault_count changes.

Now it could very well be that I'm missing something around userfault - I
don't remember the details, except that every time I try to reconstruct a
mental model for this I get it wrong for a few days. But if that's the case
then I think your patch isn't sufficient.

- Just for code organization reasons I think we should then move
  mm.userfault_list to the ggtt, like you've done with the fence stuff.
  Separate patch probably, like the fence prep.

- All the other places that touch userfault_* need to be audited/fixed
  too.

This is probably something we need/want to do, but I don't see the
relationship with fences. And assuming the current locking scheme is sound
(with its funky combination of dev->struct_mutex + rpm) I don't see why
we need to do anything in a patch that only moves the fence stuff over
(and not anything of the other ggtt mmap trickery) to vm.mutex.

Wrt CI coverage: Given that I think the current code without your changes
would be fine, even with fence reg tracking protected by vm.mutex, there's
nothing for CI to prove. I'm just saying you could drop all the
userfault_* related changes from this patch, and it should still all work.

tldr; I think userfault_* is safe as is and will stay safe even with
fences moved to other locks. You can keep the various hunks I complained
about for future patches which will move userfault_* over to
ggtt.vm.mutex.

And oh dear do I not look forward to reviewing userfault_* locking
changes :-)

Cheers, Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch

end of thread, other threads:[~2018-07-12  7:12 UTC | newest]

Thread overview: 26+ messages
2018-07-11  7:36 Cleanup live_hangcheck flippers Chris Wilson
2018-07-11  7:36 ` [PATCH 1/7] drm/i915: Introduce i915_address_space.mutex Chris Wilson
2018-07-11  8:09   ` Daniel Vetter
2018-07-11  9:33   ` Daniel Vetter
2018-07-11  9:36     ` Daniel Vetter
2018-07-11  9:49       ` Chris Wilson
2018-07-12  7:01         ` Daniel Vetter
2018-07-11  7:36 ` [PATCH 2/7] drm/i915: Move fence register tracking to GGTT Chris Wilson
2018-07-11  8:19   ` Daniel Vetter
2018-07-11  8:27     ` Chris Wilson
2018-07-11  7:36 ` [PATCH 3/7] drm/i915: Convert fences to use a GGTT lock rather than struct_mutex Chris Wilson
2018-07-11  9:08   ` Daniel Vetter
2018-07-11 10:57     ` Chris Wilson
2018-07-11 11:12       ` Chris Wilson
2018-07-12  7:12         ` Daniel Vetter
2018-07-11  7:36 ` [PATCH 4/7] drm/i915: Move fence-reg interface to i915_gem_fence_reg.h Chris Wilson
2018-07-11  7:36 ` [PATCH 5/7] drm/i915: Dynamically allocate the array of drm_i915_gem_fence_reg Chris Wilson
2018-07-11  9:11   ` Daniel Vetter
2018-07-11  7:36 ` [PATCH 6/7] drm/i915: Pull all the reset functionality together into i915_reset.c Chris Wilson
2018-07-11  9:17   ` Daniel Vetter
2018-07-11  9:28     ` Chris Wilson
2018-07-11  7:36 ` [PATCH 7/7] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
2018-07-11  7:46 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [1/7] drm/i915: Introduce i915_address_space.mutex Patchwork
2018-07-11  7:50 ` ✗ Fi.CI.SPARSE: " Patchwork
2018-07-11  8:03 ` ✓ Fi.CI.BAT: success " Patchwork
2018-07-11  8:59 ` ✗ Fi.CI.IGT: failure " Patchwork
