All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
@ 2019-01-31 12:01 Chris Wilson
  2019-01-31 12:01 ` [PATCH 2/5] drm/i915: Force the GPU reset upon wedging Chris Wilson
                   ` (12 more replies)
  0 siblings, 13 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  12 +-
 drivers/gpu/drm/i915/i915_drv.h               |  18 +--
 drivers/gpu/drm/i915/i915_gem.c               |  56 +++-------
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  31 +-----
 drivers/gpu/drm/i915/i915_gpu_error.h         |  12 +-
 drivers/gpu/drm/i915/i915_reset.c             | 103 +++++++++++-------
 drivers/gpu/drm/i915/i915_reset.h             |   4 +
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   1 +
 8 files changed, 103 insertions(+), 134 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fa2c226fc779..2cea263b4d79 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
+	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Wedged\n");
+		seq_puts(m, "\tWedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_puts(m, "Waiter holding struct mutex\n");
-	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_puts(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "\tDevice (global) reset in progress\n");
 
 	if (!i915_modparams.enable_hangcheck) {
 		seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_backoff(&i915->gpu_error))
-		return -EAGAIN;
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 22da9df1f0a7..f7a0e5185874 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2987,7 +2987,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
 	i915_gem_object_unpin_pages(obj);
 }
 
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+	return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
 int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_device *dev,
 			 struct drm_mode_create_dumb *args);
@@ -3004,21 +3009,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
-	return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
 	return READ_ONCE(error->reset_count);
@@ -3091,7 +3086,6 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e802af64d628..caccff87a2a1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
-	int ret;
-
-	might_sleep();
-
-	/*
-	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
-	 * userspace. If it takes that long something really bad is going on and
-	 * we should simply try to bail out and fail as gracefully as possible.
-	 */
-	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_backoff(error),
-					       I915_RESET_TIMEOUT);
-	if (ret == 0) {
-		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
-		return -EIO;
-	} else if (ret < 0) {
-		return ret;
-	} else {
-		return 0;
-	}
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
-
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static u32 __i915_gem_park(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
@@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	intel_wakeref_t wakeref;
 	struct i915_vma *vma;
 	pgoff_t page_offset;
+	int srcu;
 	int ret;
 
 	/* Sanity check that we allow writing into this object */
@@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 		goto err_unlock;
 	}
 
-
 	/* Now pin it into the GTT as needed */
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 				       PIN_MAPPABLE |
@@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
+	srcu = i915_reset_lock(dev_priv);
+	if (srcu < 0) {
+		ret = srcu;
+		goto err_unpin;
+	}
+
 	ret = i915_vma_pin_fence(vma);
 	if (ret)
-		goto err_unpin;
+		goto err_reset;
 
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
@@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 
 err_fence:
 	i915_vma_unpin_fence(vma);
+err_reset:
+	i915_reset_unlock(dev_priv, srcu);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
@@ -5324,6 +5291,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
+	init_srcu_struct(&dev_priv->gpu_error.srcu);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
@@ -5356,6 +5324,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
 
+	cleanup_srcu_struct(&dev_priv->gpu_error.srcu);
+
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 46e259661294..069db4175971 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -240,6 +240,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		i915_vma_flush_writes(old);
 	}
 
+	ret = i915_reset_lock(fence->i915);
+	if (ret < 0)
+		return ret;
+
 	if (fence->vma && fence->vma != vma) {
 		/* Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
@@ -272,6 +276,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	i915_reset_unlock(fence->i915, ret);
 	return 0;
 }
 
@@ -435,32 +440,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 	list_add(&fence->link, &fence->i915->mm.fence_list);
 }
 
-/**
- * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
- *
- * Removes all GTT mmappings via the fence registers. This forces any user
- * of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
- * revoke concurrent userspace access via GTT mmaps until the hardware has been
- * reset and the fence registers have been restored.
- */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
-{
-	int i;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
-
-		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
-
-		if (fence->vma)
-			i915_vma_revoke_mmap(fence->vma);
-	}
-}
-
 /**
  * i915_gem_restore_fences - restore fence state
  * @dev_priv: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..4e797c552b96 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -231,12 +231,10 @@ struct i915_gpu_error {
 	/**
 	 * flags: Control various stages of the GPU reset
 	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
+	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
+	 * serialise with any other users attempting to do the same, and
+	 * any global resources that may be clobber by the reset (such as
+	 * FENCE registers).
 	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
@@ -272,6 +270,8 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
+	struct srcu_struct srcu;
+
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 4462007a681c..0a6784cec4fd 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -639,6 +639,31 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
 	engine->reset.prepare(engine);
 }
 
+static void revoke_mmaps(struct drm_i915_private *i915)
+{
+	int i;
+
+	for (i = 0; i < i915->num_fence_regs; i++) {
+		struct i915_vma *vma = i915->fence_regs[i].vma;
+		struct drm_vma_offset_node *node;
+		u64 vma_offset;
+
+		if (!vma)
+			continue;
+
+		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
+		if (!i915_vma_has_userfault(vma))
+			continue;
+
+		node = &vma->obj->base.vma_node;
+		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
+		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
+				    drm_vma_node_offset_addr(node) + vma_offset,
+				    vma->size,
+				    1);
+	}
+}
+
 static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -648,6 +673,7 @@ static void reset_prepare(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	intel_uc_sanitize(i915);
+	revoke_mmaps(i915);
 }
 
 static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +937,22 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
-struct __i915_reset {
-	struct drm_i915_private *i915;
-	unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
-	struct __i915_reset *arg = data;
-	int err;
-
-	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
-	if (err)
-		return err;
-
-	return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	struct __i915_reset arg = { i915, stalled_mask };
 	int err, i;
 
-	err = __do_reset(__i915_reset__BKL, &arg);
+	/* Flush everyone currently using a resource about to be clobbered */
+	synchronize_srcu(&i915->gpu_error.srcu);
+
+	err = intel_gpu_reset(i915, ALL_ENGINES);
 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
-		msleep(100);
-		err = __do_reset(__i915_reset__BKL, &arg);
+		msleep(10 * (i + 1));
+		err = intel_gpu_reset(i915, ALL_ENGINES);
 	}
+	if (err)
+		return err;
 
-	return err;
+	return gt_reset(i915, stalled_mask);
 }
 
 /**
@@ -1274,9 +1272,12 @@ void i915_handle_error(struct drm_i915_private *i915,
 		wait_event(i915->gpu_error.reset_queue,
 			   !test_bit(I915_RESET_BACKOFF,
 				     &i915->gpu_error.flags));
-		goto out;
+		goto out; /* piggy-back on the other reset */
 	}
 
+	/* Make sure i915_reset_lock() sees the I915_RESET_BACKOFF */
+	synchronize_rcu_expedited();
+
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1301,32 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+int i915_reset_lock(struct drm_i915_private *i915)
+{
+	int srcu;
+
+	rcu_read_lock();
+	while (test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
+		rcu_read_unlock();
+
+		if (wait_event_interruptible(i915->gpu_error.reset_queue,
+					     !test_bit(I915_RESET_BACKOFF,
+						       &i915->gpu_error.flags)))
+			return -EINTR;
+
+		rcu_read_lock();
+	}
+	srcu = srcu_read_lock(&i915->gpu_error.srcu);
+	rcu_read_unlock();
+
+	return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+{
+	srcu_read_unlock(&i915->gpu_error.srcu, tag);
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..eb412e0158da 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/srcu.h>
 
 struct drm_i915_private;
 struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
 void i915_reset_request(struct i915_request *rq, bool guilty);
 bool i915_reset_flush(struct drm_i915_private *i915);
 
+int i915_reset_lock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..074a0d9cbf26 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	init_srcu_struct(&i915->gpu_error.srcu);
 	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/5] drm/i915: Force the GPU reset upon wedging
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-01-31 12:01 ` Chris Wilson
  2019-01-31 12:01 ` [PATCH 3/5] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
                   ` (11 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

When declaring the GPU wedged, we do need to hit the GPU with the reset
hammer so that its state matches our presumed state during cleanup. If
the reset fails, it fails, and we may be unhappy but wedged. However, if
we are testing our wedge/unwedged handling, the desync carries over into
the next test and promptly explodes.

References: https://bugs.freedesktop.org/show_bug.cgi?id=106702
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_reset.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 0a6784cec4fd..db37fe819504 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -532,9 +532,6 @@ typedef int (*reset_func)(struct drm_i915_private *,
 
 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
 {
-	if (!i915_modparams.reset)
-		return NULL;
-
 	if (INTEL_GEN(i915) >= 8)
 		return gen8_reset_engines;
 	else if (INTEL_GEN(i915) >= 6)
@@ -599,6 +596,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *i915)
 	if (USES_GUC(i915))
 		return false;
 
+	if (!i915_modparams.reset)
+		return NULL;
+
 	return intel_get_gpu_reset(i915);
 }
 
@@ -823,7 +823,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	/* Even if the GPU reset fails, it should still stop the engines */
-	if (INTEL_GEN(i915) >= 5)
+	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
 		intel_gpu_reset(i915, ALL_ENGINES);
 
 	for_each_engine(engine, i915, id) {
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 3/5] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
  2019-01-31 12:01 ` [PATCH 2/5] drm/i915: Force the GPU reset upon wedging Chris Wilson
@ 2019-01-31 12:01 ` Chris Wilson
  2019-01-31 12:01 ` [PATCH 4/5] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
                   ` (10 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:01 UTC (permalink / raw)
  To: intel-gfx

On wedging, we mark all executing requests as complete and all pending
requests completed as soon as they are ready. Before unwedging though we
wish to flush those pending requests prior to restoring default
execution, and so we must wait. Do so interruptibly as we do not provide
the EINTR gracefully back to userspace in this case but persistent in
the permanently wedged start without restarting the syscall.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index db37fe819504..bbe3141c5c72 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -861,7 +861,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
-	bool ret = false;
 
 	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
@@ -886,30 +885,20 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	mutex_lock(&i915->gt.timelines.mutex);
 	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
 		struct i915_request *rq;
-		long timeout;
 
 		rq = i915_gem_active_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
 		/*
-		 * We can't use our normal waiter as we want to
-		 * avoid recursively trying to handle the current
-		 * reset. The basic dma_fence_default_wait() installs
-		 * a callback for dma_fence_signal(), which is
-		 * triggered by our nop handler (indirectly, the
-		 * callback enables the signaler thread which is
-		 * woken by the nop_submit_request() advancing the seqno
-		 * and when the seqno passes the fence, the signaler
-		 * then signals the fence waking us up).
+		 * All internal dependencies (i915_requests) will have
+		 * been flushed by the set-wedge, but we may be stuck waiting
+		 * for external fences. These should all be capped to 10s
+		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
+		 * in the worst case.
 		 */
-		timeout = dma_fence_default_wait(&rq->fence, true,
-						 MAX_SCHEDULE_TIMEOUT);
+		dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
 		i915_request_put(rq);
-		if (timeout < 0) {
-			mutex_unlock(&i915->gt.timelines.mutex);
-			goto unlock;
-		}
 	}
 	mutex_unlock(&i915->gt.timelines.mutex);
 
@@ -930,11 +919,10 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-	ret = true;
-unlock:
+
 	mutex_unlock(&i915->gpu_error.wedge_mutex);
 
-	return ret;
+	return true;
 }
 
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 4/5] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
  2019-01-31 12:01 ` [PATCH 2/5] drm/i915: Force the GPU reset upon wedging Chris Wilson
  2019-01-31 12:01 ` [PATCH 3/5] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
@ 2019-01-31 12:01 ` Chris Wilson
  2019-01-31 12:01 ` [PATCH 5/5] drm/i915: Serialise resets with wedging Chris Wilson
                   ` (9 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:01 UTC (permalink / raw)
  To: intel-gfx

Since we use the debugfs to recover the device after modifying the
i915.reset parameter, we need to be sure that we apply the reset and not
piggy-back onto a concurrent one in order for the parameter to take
effect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2cea263b4d79..54e426883529 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
 
-	/*
-	 * There is no safeguard against this debugfs entry colliding
-	 * with the hangcheck calling same i915_handle_error() in
-	 * parallel, causing an explosion. For now we assume that the
-	 * test harness is responsible enough not to inject gpu hangs
-	 * while it is writing to 'i915_wedged'
-	 */
+	/* Flush any previous reset before applying for a new one */
+	wait_event(i915->gpu_error.reset_queue,
+		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 5/5] drm/i915: Serialise resets with wedging
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (2 preceding siblings ...)
  2019-01-31 12:01 ` [PATCH 4/5] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
@ 2019-01-31 12:01 ` Chris Wilson
  2019-01-31 12:13   ` [PATCH] " Chris Wilson
  2019-01-31 12:14   ` Chris Wilson
  2019-01-31 12:17 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
                   ` (8 subsequent siblings)
  12 siblings, 2 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:01 UTC (permalink / raw)
  To: intel-gfx

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 71 ++++++++++++++++++-------------
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index bbe3141c5c72..3a14109435d4 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -793,18 +793,12 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
-		return;
-	}
-
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
 
@@ -852,12 +846,20 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	if (!test_bit(I915_WEDGED, &error->flags)) {
+		__i915_gem_set_wedged(i915);
+		mutex_unlock(&error->wedge_mutex);
+	}
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -868,8 +870,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -920,11 +920,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -976,7 +986,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1038,7 +1048,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1198,6 +1208,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1234,20 +1245,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1256,10 +1266,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1269,21 +1278,23 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
 
+	mutex_lock(&error->wedge_mutex);
 	i915_reset_device(i915, engine_mask, msg);
+	mutex_unlock(&error->wedge_mutex);
 
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] drm/i915: Serialise resets with wedging
  2019-01-31 12:01 ` [PATCH 5/5] drm/i915: Serialise resets with wedging Chris Wilson
@ 2019-01-31 12:13   ` Chris Wilson
  2019-01-31 12:14   ` Chris Wilson
  1 sibling, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:13 UTC (permalink / raw)
  To: intel-gfx

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 70 ++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index bbe3141c5c72..ce266865b594 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -793,18 +793,12 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
-		return;
-	}
-
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
 
@@ -852,12 +846,19 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	if (!test_bit(I915_WEDGED, &error->flags))
+		__i915_gem_set_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -868,8 +869,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -920,11 +919,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -976,7 +985,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1038,7 +1047,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1198,6 +1207,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1234,20 +1244,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1256,10 +1265,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1269,21 +1277,23 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
 
+	mutex_lock(&error->wedge_mutex);
 	i915_reset_device(i915, engine_mask, msg);
+	mutex_unlock(&error->wedge_mutex);
 
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH] drm/i915: Serialise resets with wedging
  2019-01-31 12:01 ` [PATCH 5/5] drm/i915: Serialise resets with wedging Chris Wilson
  2019-01-31 12:13   ` [PATCH] " Chris Wilson
@ 2019-01-31 12:14   ` Chris Wilson
  1 sibling, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 12:14 UTC (permalink / raw)
  To: intel-gfx

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 68 ++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index bbe3141c5c72..5e21bf781c0b 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -793,17 +793,14 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags))
 		return;
-	}
 
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
@@ -852,12 +849,18 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	__i915_gem_set_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -868,8 +871,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -920,11 +921,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -976,7 +987,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1038,7 +1049,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1198,6 +1209,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1234,20 +1246,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1256,10 +1267,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1269,21 +1279,23 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
 
+	mutex_lock(&error->wedge_mutex);
 	i915_reset_device(i915, engine_mask, msg);
+	mutex_unlock(&error->wedge_mutex);
 
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (3 preceding siblings ...)
  2019-01-31 12:01 ` [PATCH 5/5] drm/i915: Serialise resets with wedging Chris Wilson
@ 2019-01-31 12:17 ` Patchwork
  2019-01-31 12:39 ` ✗ Fi.CI.BAT: failure " Patchwork
                   ` (7 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 12:17 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/56042/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1304:5: warning: context imbalance in 'i915_reset_lock' - different lock contexts for basic block
+drivers/gpu/drm/i915/i915_reset.c:1325:6: warning: context imbalance in 'i915_reset_unlock' - unexpected unlock
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3549:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3543:16: warning: expression using sizeof(void)

Commit: drm/i915: Force the GPU reset upon wedging
Okay!

Commit: drm/i915: Uninterruptibly drain the timelines on unwedging
Okay!

Commit: drm/i915: Wait for old resets before applying debugfs/i915_wedged
Okay!

Commit: drm/i915: Serialise resets with wedging
Okay!

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (4 preceding siblings ...)
  2019-01-31 12:17 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
@ 2019-01-31 12:39 ` Patchwork
  2019-01-31 12:40 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
                   ` (6 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 12:39 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/56042/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5518 -> Patchwork_12107
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12107 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12107, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/56042/revisions/1/mbox/

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12107:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live_hangcheck:
    - fi-bdw-gvtdvm:      PASS -> TIMEOUT
    - fi-bdw-5557u:       PASS -> TIMEOUT
    - fi-bwr-2160:        PASS -> INCOMPLETE
    - fi-kbl-7567u:       PASS -> TIMEOUT
    - fi-glk-j4005:       PASS -> TIMEOUT
    - fi-cfl-8109u:       PASS -> TIMEOUT
    - fi-snb-2520m:       PASS -> TIMEOUT
    - fi-cfl-8700k:       PASS -> INCOMPLETE
    - fi-whl-u:           PASS -> TIMEOUT
    - fi-ilk-650:         PASS -> TIMEOUT
    - fi-ivb-3520m:       PASS -> TIMEOUT
    - fi-kbl-x1275:       PASS -> TIMEOUT
    - fi-kbl-8809g:       PASS -> TIMEOUT
    - fi-bsw-kefka:       NOTRUN -> TIMEOUT
    - fi-blb-e6850:       PASS -> TIMEOUT
    - fi-byt-clapper:     PASS -> TIMEOUT
    - fi-skl-6600u:       NOTRUN -> TIMEOUT
    - fi-bsw-n3050:       PASS -> INCOMPLETE
    - fi-byt-j1900:       PASS -> TIMEOUT
    - fi-hsw-4770:        PASS -> INCOMPLETE
    - fi-skl-6770hq:      PASS -> TIMEOUT
    - fi-byt-n2820:       PASS -> TIMEOUT
    - fi-ivb-3770:        NOTRUN -> INCOMPLETE
    - fi-skl-6700k2:      PASS -> TIMEOUT
    - fi-snb-2600:        NOTRUN -> TIMEOUT
    - fi-hsw-4770r:       PASS -> TIMEOUT
    - fi-hsw-peppy:       PASS -> TIMEOUT

  
Known issues
------------

  Here are the changes found in Patchwork_12107 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live_hangcheck:
    - fi-kbl-r:           PASS -> INCOMPLETE [fdo#108744]
    - fi-elk-e7500:       PASS -> INCOMPLETE [fdo#103989]
    - fi-skl-gvtdvm:      PASS -> INCOMPLETE [fdo#105600] / [fdo#108744]
    - fi-bxt-j4205:       PASS -> INCOMPLETE [fdo#103927]
    - fi-skl-6700hq:      PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7500u:       PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7560u:       PASS -> INCOMPLETE [fdo#108044] / [fdo#108744]
    - fi-skl-6260u:       PASS -> INCOMPLETE [fdo#108744]

  * igt@kms_busy@basic-flip-b:
    - fi-gdg-551:         PASS -> FAIL [fdo#103182]

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
    - fi-byt-clapper:     PASS -> FAIL [fdo#103191] / [fdo#107362] +1

  
#### Possible fixes ####

  * igt@kms_busy@basic-flip-a:
    - fi-kbl-7567u:       {SKIP} [fdo#109271] / [fdo#109278] -> PASS +2
    - fi-gdg-551:         FAIL [fdo#103182] -> PASS

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-kbl-7500u:       FAIL [fdo#109485] -> PASS

  * igt@kms_pipe_crc_basic@read-crc-pipe-b-frame-sequence:
    - fi-byt-clapper:     FAIL [fdo#103191] / [fdo#107362] -> PASS

  * igt@pm_rpm@module-reload:
    - fi-skl-6770hq:      FAIL [fdo#108511] -> PASS

  * igt@prime_vgem@basic-fence-flip:
    - fi-gdg-551:         DMESG-FAIL [fdo#103182] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#103989]: https://bugs.freedesktop.org/show_bug.cgi?id=103989
  [fdo#105600]: https://bugs.freedesktop.org/show_bug.cgi?id=105600
  [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
  [fdo#108044]: https://bugs.freedesktop.org/show_bug.cgi?id=108044
  [fdo#108511]: https://bugs.freedesktop.org/show_bug.cgi?id=108511
  [fdo#108569]: https://bugs.freedesktop.org/show_bug.cgi?id=108569
  [fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
  [fdo#108915]: https://bugs.freedesktop.org/show_bug.cgi?id=108915
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109485]: https://bugs.freedesktop.org/show_bug.cgi?id=109485
  [fdo#109516]: https://bugs.freedesktop.org/show_bug.cgi?id=109516


Participating hosts (43 -> 45)
------------------------------

  Additional (6): fi-bxt-dsi fi-ivb-3770 fi-icl-y fi-bsw-kefka fi-skl-6600u fi-snb-2600 
  Missing    (4): fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-pnv-d510 


Build changes
-------------

    * Linux: CI_DRM_5518 -> Patchwork_12107

  CI_DRM_5518: 2369fd28d3a46b865f6d4f1d309a4c6b7b4e6d93 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4801: 6f6bacf12759fb319ade3ba37861ae711f8a5cd9 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12107: 575ec763f436b9f173f86b120bc99fbdc971391c @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

575ec763f436 drm/i915: Serialise resets with wedging
3163e10f9ffc drm/i915: Wait for old resets before applying debugfs/i915_wedged
12fc774a1e46 drm/i915: Uninterruptibly drain the timelines on unwedging
66073a0b814f drm/i915: Force the GPU reset upon wedging
c80d908c0b3f drm/i915: Revoke mmaps and prevent access to fence registers across reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12107/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (5 preceding siblings ...)
  2019-01-31 12:39 ` ✗ Fi.CI.BAT: failure " Patchwork
@ 2019-01-31 12:40 ` Patchwork
  2019-01-31 12:59 ` ✗ Fi.CI.BAT: failure " Patchwork
                   ` (5 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 12:40 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
URL   : https://patchwork.freedesktop.org/series/56042/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1304:5: warning: context imbalance in 'i915_reset_lock' - different lock contexts for basic block
+drivers/gpu/drm/i915/i915_reset.c:1325:6: warning: context imbalance in 'i915_reset_unlock' - unexpected unlock
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3549:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3543:16: warning: expression using sizeof(void)

Commit: drm/i915: Force the GPU reset upon wedging
Okay!

Commit: drm/i915: Uninterruptibly drain the timelines on unwedging
Okay!

Commit: drm/i915: Wait for old resets before applying debugfs/i915_wedged
Okay!

Commit: drm/i915: Serialise resets with wedging
Okay!

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (6 preceding siblings ...)
  2019-01-31 12:40 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
@ 2019-01-31 12:59 ` Patchwork
  2019-01-31 13:12 ` [PATCH] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (4 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 12:59 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
URL   : https://patchwork.freedesktop.org/series/56042/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5518 -> Patchwork_12108
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12108 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12108, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/56042/revisions/3/mbox/

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12108:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live_hangcheck:
    - fi-bdw-gvtdvm:      PASS -> TIMEOUT
    - fi-bdw-5557u:       PASS -> TIMEOUT
    - fi-bwr-2160:        PASS -> INCOMPLETE
    - fi-kbl-7567u:       PASS -> TIMEOUT
    - fi-glk-j4005:       PASS -> TIMEOUT
    - fi-cfl-8109u:       PASS -> TIMEOUT
    - fi-snb-2520m:       PASS -> TIMEOUT
    - fi-pnv-d510:        PASS -> TIMEOUT
    - fi-cfl-8700k:       PASS -> INCOMPLETE
    - fi-whl-u:           PASS -> TIMEOUT
    - fi-ilk-650:         PASS -> TIMEOUT
    - fi-ivb-3520m:       PASS -> INCOMPLETE
    - fi-kbl-x1275:       PASS -> TIMEOUT
    - fi-bsw-kefka:       NOTRUN -> TIMEOUT
    - fi-byt-clapper:     PASS -> TIMEOUT
    - fi-bsw-n3050:       PASS -> TIMEOUT
    - fi-hsw-4770:        PASS -> INCOMPLETE
    - fi-skl-6770hq:      PASS -> TIMEOUT
    - fi-ivb-3770:        NOTRUN -> INCOMPLETE
    - fi-skl-6260u:       PASS -> TIMEOUT
    - fi-snb-2600:        NOTRUN -> TIMEOUT
    - fi-hsw-4770r:       PASS -> INCOMPLETE
    - fi-hsw-peppy:       PASS -> TIMEOUT

  
Known issues
------------

  Here are the changes found in Patchwork_12108 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@amdgpu/amd_basic@userptr:
    - fi-kbl-8809g:       PASS -> DMESG-WARN [fdo#108965]

  * igt@gem_exec_suspend@basic-s4-devices:
    - fi-blb-e6850:       PASS -> INCOMPLETE [fdo#107718]

  * igt@i915_selftest@live_hangcheck:
    - fi-kbl-r:           PASS -> INCOMPLETE [fdo#108744]
    - fi-elk-e7500:       PASS -> INCOMPLETE [fdo#103989]
    - fi-skl-gvtdvm:      PASS -> INCOMPLETE [fdo#105600] / [fdo#108744]
    - fi-bxt-j4205:       PASS -> INCOMPLETE [fdo#103927]
    - fi-skl-6700hq:      PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7500u:       PASS -> INCOMPLETE [fdo#108744]
    - fi-byt-j1900:       PASS -> INCOMPLETE [fdo#102657]
    - fi-kbl-7560u:       PASS -> INCOMPLETE [fdo#108044] / [fdo#108744]
    - fi-byt-n2820:       PASS -> INCOMPLETE [fdo#102657]
    - fi-skl-6700k2:      PASS -> INCOMPLETE [fdo#108744]

  * igt@kms_pipe_crc_basic@nonblocking-crc-pipe-a-frame-sequence:
    - fi-byt-clapper:     PASS -> FAIL [fdo#103191] / [fdo#107362]

  
#### Possible fixes ####

  * igt@i915_selftest@live_execlists:
    - fi-apl-guc:         INCOMPLETE [fdo#103927] -> PASS

  * igt@kms_busy@basic-flip-a:
    - fi-kbl-7567u:       {SKIP} [fdo#109271] / [fdo#109278] -> PASS +2
    - fi-gdg-551:         FAIL [fdo#103182] -> PASS

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-kbl-7500u:       FAIL [fdo#109485] -> PASS

  * igt@kms_pipe_crc_basic@read-crc-pipe-b-frame-sequence:
    - fi-byt-clapper:     FAIL [fdo#103191] / [fdo#107362] -> PASS

  * igt@pm_rpm@module-reload:
    - fi-skl-6770hq:      FAIL [fdo#108511] -> PASS

  * igt@prime_vgem@basic-fence-flip:
    - fi-gdg-551:         DMESG-FAIL [fdo#103182] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#102657]: https://bugs.freedesktop.org/show_bug.cgi?id=102657
  [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#103989]: https://bugs.freedesktop.org/show_bug.cgi?id=103989
  [fdo#105600]: https://bugs.freedesktop.org/show_bug.cgi?id=105600
  [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
  [fdo#107718]: https://bugs.freedesktop.org/show_bug.cgi?id=107718
  [fdo#108044]: https://bugs.freedesktop.org/show_bug.cgi?id=108044
  [fdo#108511]: https://bugs.freedesktop.org/show_bug.cgi?id=108511
  [fdo#108569]: https://bugs.freedesktop.org/show_bug.cgi?id=108569
  [fdo#108654]: https://bugs.freedesktop.org/show_bug.cgi?id=108654
  [fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
  [fdo#108965]: https://bugs.freedesktop.org/show_bug.cgi?id=108965
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109485]: https://bugs.freedesktop.org/show_bug.cgi?id=109485
  [fdo#109516]: https://bugs.freedesktop.org/show_bug.cgi?id=109516


Participating hosts (43 -> 44)
------------------------------

  Additional (4): fi-bsw-kefka fi-bxt-dsi fi-ivb-3770 fi-snb-2600 
  Missing    (3): fi-ilk-m540 fi-byt-squawks fi-bsw-cyan 


Build changes
-------------

    * Linux: CI_DRM_5518 -> Patchwork_12108

  CI_DRM_5518: 2369fd28d3a46b865f6d4f1d309a4c6b7b4e6d93 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4801: 6f6bacf12759fb319ade3ba37861ae711f8a5cd9 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12108: 91dfc51e3be0c229cd8ca3b044a75b02379680b9 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

91dfc51e3be0 drm/i915: Serialise resets with wedging
fb235c8c0069 drm/i915: Wait for old resets before applying debugfs/i915_wedged
8e711e1e8325 drm/i915: Uninterruptibly drain the timelines on unwedging
1de71716d9dc drm/i915: Force the GPU reset upon wedging
03fc31d218e7 drm/i915: Revoke mmaps and prevent access to fence registers across reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12108/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (7 preceding siblings ...)
  2019-01-31 12:59 ` ✗ Fi.CI.BAT: failure " Patchwork
@ 2019-01-31 13:12 ` Chris Wilson
  2019-01-31 13:23 ` ✗ Fi.CI.SPARSE: warning for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4) Patchwork
                   ` (3 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 13:12 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  12 +-
 drivers/gpu/drm/i915/i915_drv.h               |  18 +--
 drivers/gpu/drm/i915/i915_gem.c               |  56 +++------
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  31 +----
 drivers/gpu/drm/i915/i915_gpu_error.h         |  12 +-
 drivers/gpu/drm/i915/i915_reset.c             | 107 +++++++++++-------
 drivers/gpu/drm/i915/i915_reset.h             |   4 +
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   5 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   1 +
 9 files changed, 109 insertions(+), 137 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fa2c226fc779..2cea263b4d79 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
+	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Wedged\n");
+		seq_puts(m, "\tWedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_puts(m, "Waiter holding struct mutex\n");
-	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_puts(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "\tDevice (global) reset in progress\n");
 
 	if (!i915_modparams.enable_hangcheck) {
 		seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_backoff(&i915->gpu_error))
-		return -EAGAIN;
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 22da9df1f0a7..f7a0e5185874 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2987,7 +2987,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
 	i915_gem_object_unpin_pages(obj);
 }
 
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+	return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
 int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_device *dev,
 			 struct drm_mode_create_dumb *args);
@@ -3004,21 +3009,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
-	return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
 	return READ_ONCE(error->reset_count);
@@ -3091,7 +3086,6 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e802af64d628..a7a9bb0e3b56 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
-	int ret;
-
-	might_sleep();
-
-	/*
-	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
-	 * userspace. If it takes that long something really bad is going on and
-	 * we should simply try to bail out and fail as gracefully as possible.
-	 */
-	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_backoff(error),
-					       I915_RESET_TIMEOUT);
-	if (ret == 0) {
-		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
-		return -EIO;
-	} else if (ret < 0) {
-		return ret;
-	} else {
-		return 0;
-	}
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
-
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static u32 __i915_gem_park(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
@@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	intel_wakeref_t wakeref;
 	struct i915_vma *vma;
 	pgoff_t page_offset;
+	int srcu;
 	int ret;
 
 	/* Sanity check that we allow writing into this object */
@@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 		goto err_unlock;
 	}
 
-
 	/* Now pin it into the GTT as needed */
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 				       PIN_MAPPABLE |
@@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
+	srcu = i915_reset_trylock(dev_priv);
+	if (srcu < 0) {
+		ret = srcu;
+		goto err_unpin;
+	}
+
 	ret = i915_vma_pin_fence(vma);
 	if (ret)
-		goto err_unpin;
+		goto err_reset;
 
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
@@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 
 err_fence:
 	i915_vma_unpin_fence(vma);
+err_reset:
+	i915_reset_unlock(dev_priv, srcu);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
@@ -5324,6 +5291,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
+	init_srcu_struct(&dev_priv->gpu_error.srcu);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
@@ -5356,6 +5324,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
 
+	cleanup_srcu_struct(&dev_priv->gpu_error.srcu);
+
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 46e259661294..bd0d5b8d6c96 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -240,6 +240,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		i915_vma_flush_writes(old);
 	}
 
+	ret = i915_reset_trylock(fence->i915);
+	if (ret < 0)
+		return ret;
+
 	if (fence->vma && fence->vma != vma) {
 		/* Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
@@ -272,6 +276,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	i915_reset_unlock(fence->i915, ret);
 	return 0;
 }
 
@@ -435,32 +440,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 	list_add(&fence->link, &fence->i915->mm.fence_list);
 }
 
-/**
- * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
- *
- * Removes all GTT mmappings via the fence registers. This forces any user
- * of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
- * revoke concurrent userspace access via GTT mmaps until the hardware has been
- * reset and the fence registers have been restored.
- */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
-{
-	int i;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
-
-		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
-
-		if (fence->vma)
-			i915_vma_revoke_mmap(fence->vma);
-	}
-}
-
 /**
  * i915_gem_restore_fences - restore fence state
  * @dev_priv: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..4e797c552b96 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -231,12 +231,10 @@ struct i915_gpu_error {
 	/**
 	 * flags: Control various stages of the GPU reset
 	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
+	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
+	 * serialise with any other users attempting to do the same, and
+	 * any global resources that may be clobber by the reset (such as
+	 * FENCE registers).
 	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
@@ -272,6 +270,8 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
+	struct srcu_struct srcu;
+
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 4462007a681c..f58fae457ec6 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -639,6 +639,31 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
 	engine->reset.prepare(engine);
 }
 
+static void revoke_mmaps(struct drm_i915_private *i915)
+{
+	int i;
+
+	for (i = 0; i < i915->num_fence_regs; i++) {
+		struct i915_vma *vma = i915->fence_regs[i].vma;
+		struct drm_vma_offset_node *node;
+		u64 vma_offset;
+
+		if (!vma)
+			continue;
+
+		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
+		if (!i915_vma_has_userfault(vma))
+			continue;
+
+		node = &vma->obj->base.vma_node;
+		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
+		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
+				    drm_vma_node_offset_addr(node) + vma_offset,
+				    vma->size,
+				    1);
+	}
+}
+
 static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -648,6 +673,7 @@ static void reset_prepare(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	intel_uc_sanitize(i915);
+	revoke_mmaps(i915);
 }
 
 static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +937,22 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
-struct __i915_reset {
-	struct drm_i915_private *i915;
-	unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
-	struct __i915_reset *arg = data;
-	int err;
-
-	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
-	if (err)
-		return err;
-
-	return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	struct __i915_reset arg = { i915, stalled_mask };
 	int err, i;
 
-	err = __do_reset(__i915_reset__BKL, &arg);
+	/* Flush everyone currently using a resource about to be clobbered */
+	synchronize_srcu(&i915->gpu_error.srcu);
+
+	err = intel_gpu_reset(i915, ALL_ENGINES);
 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
-		msleep(100);
-		err = __do_reset(__i915_reset__BKL, &arg);
+		msleep(10 * (i + 1));
+		err = intel_gpu_reset(i915, ALL_ENGINES);
 	}
+	if (err)
+		return err;
 
-	return err;
+	return gt_reset(i915, stalled_mask);
 }
 
 /**
@@ -1274,9 +1272,12 @@ void i915_handle_error(struct drm_i915_private *i915,
 		wait_event(i915->gpu_error.reset_queue,
 			   !test_bit(I915_RESET_BACKOFF,
 				     &i915->gpu_error.flags));
-		goto out;
+		goto out; /* piggy-back on the other reset */
 	}
 
+	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
+	synchronize_rcu_expedited();
+
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1301,36 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+int i915_reset_trylock(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	int srcu;
+
+	rcu_read_lock();
+	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
+		rcu_read_unlock();
+
+		if (wait_event_interruptible(error->reset_queue,
+					     !test_bit(I915_RESET_BACKOFF,
+						       &error->flags)))
+			return -EINTR;
+
+		rcu_read_lock();
+	}
+	srcu = srcu_read_lock(&error->srcu);
+	rcu_read_unlock();
+
+	return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+__releases(&i915->gpu_error.srcu)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	srcu_read_unlock(&error->srcu, tag);
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..893c5d1c2eb8 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/srcu.h>
 
 struct drm_i915_private;
 struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
 void i915_reset_request(struct i915_request *rq, bool guilty);
 bool i915_reset_flush(struct drm_i915_private *i915);
 
+int __must_check i915_reset_trylock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 7b6f3bea9ef8..4886fac12628 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1039,8 +1039,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 
 	/* Check that we can recover an unbind stuck on a hanging request */
 
-	igt_global_reset_lock(i915);
-
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
 	if (err)
@@ -1138,7 +1136,9 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
+	igt_global_reset_lock(i915);
 	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
+	igt_global_reset_unlock(i915);
 
 	if (tsk) {
 		struct igt_wedge_me w;
@@ -1159,7 +1159,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
-	igt_global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..074a0d9cbf26 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	init_srcu_struct(&i915->gpu_error.srcu);
 	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.SPARSE: warning for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (8 preceding siblings ...)
  2019-01-31 13:12 ` [PATCH] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-01-31 13:23 ` Patchwork
  2019-01-31 13:45 ` ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 13:23 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
URL   : https://patchwork.freedesktop.org/series/56042/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1304:5: warning: context imbalance in 'i915_reset_trylock' - different lock contexts for basic block
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3549:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3543:16: warning: expression using sizeof(void)

Commit: drm/i915: Force the GPU reset upon wedging
Okay!

Commit: drm/i915: Uninterruptibly drain the timelines on unwedging
Okay!

Commit: drm/i915: Wait for old resets before applying debugfs/i915_wedged
Okay!

Commit: drm/i915: Serialise resets with wedging
Okay!

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (9 preceding siblings ...)
  2019-01-31 13:23 ` ✗ Fi.CI.SPARSE: warning for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4) Patchwork
@ 2019-01-31 13:45 ` Patchwork
  2019-01-31 22:33 ` ✗ Fi.CI.IGT: failure " Patchwork
  2019-02-01 21:25 ` [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset kbuild test robot
  12 siblings, 0 replies; 17+ messages in thread
From: Patchwork @ 2019-01-31 13:45 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
URL   : https://patchwork.freedesktop.org/series/56042/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_5518 -> Patchwork_12109
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/56042/revisions/4/mbox/

Known issues
------------

  Here are the changes found in Patchwork_12109 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_suspend@basic-s4-devices:
    - fi-blb-e6850:       PASS -> INCOMPLETE [fdo#107718]

  
#### Possible fixes ####

  * igt@i915_selftest@live_hangcheck:
    - fi-skl-iommu:       INCOMPLETE [fdo#108602] / [fdo#108744] -> PASS

  * igt@kms_busy@basic-flip-a:
    - fi-kbl-7567u:       {SKIP} [fdo#109271] / [fdo#109278] -> PASS +2

  * igt@kms_pipe_crc_basic@read-crc-pipe-b-frame-sequence:
    - fi-byt-clapper:     FAIL [fdo#103191] / [fdo#107362] -> PASS

  * igt@pm_rpm@module-reload:
    - fi-skl-6770hq:      FAIL [fdo#108511] -> PASS

  * igt@prime_vgem@basic-fence-flip:
    - fi-gdg-551:         DMESG-FAIL [fdo#103182] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
  [fdo#107718]: https://bugs.freedesktop.org/show_bug.cgi?id=107718
  [fdo#108511]: https://bugs.freedesktop.org/show_bug.cgi?id=108511
  [fdo#108602]: https://bugs.freedesktop.org/show_bug.cgi?id=108602
  [fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
  [fdo#108915]: https://bugs.freedesktop.org/show_bug.cgi?id=108915
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109516]: https://bugs.freedesktop.org/show_bug.cgi?id=109516


Participating hosts (43 -> 46)
------------------------------

  Additional (6): fi-bxt-dsi fi-ivb-3770 fi-icl-y fi-bsw-kefka fi-skl-6600u fi-snb-2600 
  Missing    (3): fi-ilk-m540 fi-byt-squawks fi-bsw-cyan 


Build changes
-------------

    * Linux: CI_DRM_5518 -> Patchwork_12109

  CI_DRM_5518: 2369fd28d3a46b865f6d4f1d309a4c6b7b4e6d93 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4801: 6f6bacf12759fb319ade3ba37861ae711f8a5cd9 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12109: b14cb740ba0da05e4c0bc3d7c4384487aa938dd8 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

b14cb740ba0d drm/i915: Serialise resets with wedging
4e2a0fb09b7a drm/i915: Wait for old resets before applying debugfs/i915_wedged
6c7dbf50ca32 drm/i915: Uninterruptibly drain the timelines on unwedging
2b82a38d5c70 drm/i915: Force the GPU reset upon wedging
2d631da15f0d drm/i915: Revoke mmaps and prevent access to fence registers across reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12109/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* ✗ Fi.CI.IGT: failure for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (10 preceding siblings ...)
  2019-01-31 13:45 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2019-01-31 22:33 ` Patchwork
  2019-01-31 22:37   ` Chris Wilson
  2019-02-01 21:25 ` [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset kbuild test robot
  12 siblings, 1 reply; 17+ messages in thread
From: Patchwork @ 2019-01-31 22:33 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
URL   : https://patchwork.freedesktop.org/series/56042/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5518_full -> Patchwork_12109_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12109_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12109_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12109_full:

### IGT changes ###

#### Possible regressions ####

  * igt@gem_eio@unwedge-stress:
    - shard-snb:          PASS -> FAIL

  
Known issues
------------

  Here are the changes found in Patchwork_12109_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_schedule@pi-ringfull-render:
    - shard-apl:          NOTRUN -> FAIL [fdo#103158]

  * igt@kms_color@pipe-a-legacy-gamma:
    - shard-apl:          PASS -> FAIL [fdo#104782] / [fdo#108145]

  * igt@kms_cursor_crc@cursor-128x128-sliding:
    - shard-glk:          PASS -> FAIL [fdo#103232]

  * igt@kms_cursor_crc@cursor-256x85-random:
    - shard-apl:          PASS -> FAIL [fdo#103232]

  * igt@kms_plane_alpha_blend@pipe-b-constant-alpha-max:
    - shard-glk:          PASS -> FAIL [fdo#108145] +1

  * igt@kms_plane_multiple@atomic-pipe-a-tiling-none:
    - shard-apl:          NOTRUN -> FAIL [fdo#103166]

  * igt@kms_plane_multiple@atomic-pipe-c-tiling-y:
    - shard-apl:          PASS -> FAIL [fdo#103166]

  
#### Possible fixes ####

  * igt@gem_eio@reset-stress:
    - shard-hsw:          INCOMPLETE [fdo#103540] / [fdo#109482] -> PASS

  * igt@gem_mmap_gtt@hang:
    - shard-kbl:          FAIL [fdo#109469] -> PASS
    - shard-hsw:          FAIL [fdo#109469] -> PASS
    - shard-snb:          FAIL [fdo#109469] -> PASS
    - shard-glk:          FAIL [fdo#109469] -> PASS
    - shard-apl:          FAIL [fdo#109469] -> PASS

  * igt@kms_cursor_crc@cursor-128x42-sliding:
    - shard-apl:          FAIL [fdo#103232] -> PASS +1

  * igt@kms_cursor_crc@cursor-256x256-suspend:
    - shard-hsw:          INCOMPLETE [fdo#103540] -> PASS

  * igt@kms_cursor_crc@cursor-64x64-sliding:
    - shard-glk:          FAIL [fdo#103232] -> PASS

  * igt@kms_plane@plane-position-covered-pipe-c-planes:
    - shard-apl:          FAIL [fdo#103166] -> PASS +2

  * igt@kms_setmode@basic:
    - shard-apl:          FAIL [fdo#99912] -> PASS

  * igt@pm_rc6_residency@rc6-accuracy:
    - shard-snb:          {SKIP} [fdo#109271] -> PASS

  * igt@sw_sync@sync_busy_fork:
    - shard-snb:          INCOMPLETE [fdo#105411] -> PASS +1

  
#### Warnings ####

  * igt@kms_busy@extended-modeset-hang-newfb-with-reset-render-a:
    - shard-snb:          DMESG-WARN [fdo#107956] -> INCOMPLETE [fdo#105411]

  * igt@kms_busy@extended-modeset-hang-newfb-with-reset-render-b:
    - shard-snb:          {SKIP} [fdo#109271] / [fdo#109278] -> INCOMPLETE [fdo#105411]

  * igt@kms_busy@extended-modeset-hang-newfb-with-reset-render-c:
    - shard-hsw:          DMESG-WARN [fdo#107956] -> INCOMPLETE [fdo#103540] +2

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103158]: https://bugs.freedesktop.org/show_bug.cgi?id=103158
  [fdo#103166]: https://bugs.freedesktop.org/show_bug.cgi?id=103166
  [fdo#103232]: https://bugs.freedesktop.org/show_bug.cgi?id=103232
  [fdo#103540]: https://bugs.freedesktop.org/show_bug.cgi?id=103540
  [fdo#104782]: https://bugs.freedesktop.org/show_bug.cgi?id=104782
  [fdo#105411]: https://bugs.freedesktop.org/show_bug.cgi?id=105411
  [fdo#107956]: https://bugs.freedesktop.org/show_bug.cgi?id=107956
  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109469]: https://bugs.freedesktop.org/show_bug.cgi?id=109469
  [fdo#109482]: https://bugs.freedesktop.org/show_bug.cgi?id=109482
  [fdo#99912]: https://bugs.freedesktop.org/show_bug.cgi?id=99912


Participating hosts (7 -> 5)
------------------------------

  Missing    (2): shard-skl shard-iclb 


Build changes
-------------

    * Linux: CI_DRM_5518 -> Patchwork_12109

  CI_DRM_5518: 2369fd28d3a46b865f6d4f1d309a4c6b7b4e6d93 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4801: 6f6bacf12759fb319ade3ba37861ae711f8a5cd9 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12109: b14cb740ba0da05e4c0bc3d7c4384487aa938dd8 @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12109/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: ✗ Fi.CI.IGT: failure for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4)
  2019-01-31 22:33 ` ✗ Fi.CI.IGT: failure " Patchwork
@ 2019-01-31 22:37   ` Chris Wilson
  0 siblings, 0 replies; 17+ messages in thread
From: Chris Wilson @ 2019-01-31 22:37 UTC (permalink / raw)
  To: Patchwork; +Cc: intel-gfx

Quoting Patchwork (2019-01-31 22:33:09)
> #### Possible regressions ####
> 
>   * igt@gem_eio@unwedge-stress:
>     - shard-snb:          PASS -> FAIL

Old, random 2s delay.

> #### Possible fixes ####
> 
>   * igt@gem_eio@reset-stress:
>     - shard-hsw:          INCOMPLETE [fdo#103540] / [fdo#109482] -> PASS
> 
>   * igt@gem_mmap_gtt@hang:
>     - shard-kbl:          FAIL [fdo#109469] -> PASS
>     - shard-hsw:          FAIL [fdo#109469] -> PASS
>     - shard-snb:          FAIL [fdo#109469] -> PASS
>     - shard-glk:          FAIL [fdo#109469] -> PASS
>     - shard-apl:          FAIL [fdo#109469] -> PASS

I declare thee done!
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (11 preceding siblings ...)
  2019-01-31 22:33 ` ✗ Fi.CI.IGT: failure " Patchwork
@ 2019-02-01 21:25 ` kbuild test robot
  12 siblings, 0 replies; 17+ messages in thread
From: kbuild test robot @ 2019-02-01 21:25 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx, kbuild-all, Mika Kuoppala

[-- Attachment #1: Type: text/plain, Size: 1920 bytes --]

Hi Chris,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on drm-intel/for-linux-next]
[also build test WARNING on next-20190131]
[cannot apply to v5.0-rc4]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Chris-Wilson/drm-i915-Revoke-mmaps-and-prevent-access-to-fence-registers-across-reset/20190201-043323
base:   git://anongit.freedesktop.org/drm-intel for-linux-next
reproduce:
        # apt-get install sparse
        make ARCH=x86_64 allmodconfig
        make C=1 CF='-fdiagnostic-prefix -D__CHECK_ENDIAN__'

All warnings (new ones prefixed by >>):

>> drivers/gpu/drm/i915/i915_reset.c:1304:5: sparse: warning: context imbalance in 'i915_reset_lock' - different lock contexts for basic block
>> drivers/gpu/drm/i915/i915_reset.c:1325:6: sparse: warning: context imbalance in 'i915_reset_unlock' - unexpected unlock

vim +/i915_reset_lock +1304 drivers/gpu/drm/i915/i915_reset.c

  1303	
> 1304	int i915_reset_lock(struct drm_i915_private *i915)
  1305	{
  1306		int srcu;
  1307	
  1308		rcu_read_lock();
  1309		while (test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
  1310			rcu_read_unlock();
  1311	
  1312			if (wait_event_interruptible(i915->gpu_error.reset_queue,
  1313						     !test_bit(I915_RESET_BACKOFF,
  1314							       &i915->gpu_error.flags)))
  1315				return -EINTR;
  1316	
  1317			rcu_read_lock();
  1318		}
  1319		srcu = srcu_read_lock(&i915->gpu_error.srcu);
  1320		rcu_read_unlock();
  1321	
  1322		return srcu;
  1323	}
  1324	
> 1325	void i915_reset_unlock(struct drm_i915_private *i915, int tag)
  1326	{
  1327		srcu_read_unlock(&i915->gpu_error.srcu, tag);
  1328	}
  1329	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 67050 bytes --]

[-- Attachment #3: Type: text/plain, Size: 160 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2019-02-01 21:25 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-31 12:01 [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
2019-01-31 12:01 ` [PATCH 2/5] drm/i915: Force the GPU reset upon wedging Chris Wilson
2019-01-31 12:01 ` [PATCH 3/5] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
2019-01-31 12:01 ` [PATCH 4/5] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
2019-01-31 12:01 ` [PATCH 5/5] drm/i915: Serialise resets with wedging Chris Wilson
2019-01-31 12:13   ` [PATCH] " Chris Wilson
2019-01-31 12:14   ` Chris Wilson
2019-01-31 12:17 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
2019-01-31 12:39 ` ✗ Fi.CI.BAT: failure " Patchwork
2019-01-31 12:40 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
2019-01-31 12:59 ` ✗ Fi.CI.BAT: failure " Patchwork
2019-01-31 13:12 ` [PATCH] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
2019-01-31 13:23 ` ✗ Fi.CI.SPARSE: warning for series starting with drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev4) Patchwork
2019-01-31 13:45 ` ✓ Fi.CI.BAT: success " Patchwork
2019-01-31 22:33 ` ✗ Fi.CI.IGT: failure " Patchwork
2019-01-31 22:37   ` Chris Wilson
2019-02-01 21:25 ` [PATCH 1/5] drm/i915: Revoke mmaps and prevent access to fence registers across reset kbuild test robot

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.