* [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
@ 2019-02-06 17:11 Chris Wilson
  2019-02-06 17:11 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
                   ` (8 more replies)
  0 siblings, 9 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Apply backpressure to hogs that emit requests faster than the GPU can
process them by waiting for their ring to be less than half-full before
proceeding with taking the struct_mutex.

This is a gross hack to apply throttling backpressure; the long term
goal is to remove the struct_mutex contention so that each client
naturally waits, preferably in an asynchronous, nonblocking fashion
(pipelined operations for the win), for their own resources and, at
least within the driver, never blocks another client. (Realtime priority
goals would extend to ensuring that resource contention favours high
priority clients as well.)

This patch only limits excessive request production and does not attempt
to throttle clients that block waiting for eviction (either global GTT or
system memory) or any other global resource; see above for the long term
goal.
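
As a back-of-the-envelope illustration of the half-ring hysteresis (the
numbers below are invented; the real arithmetic is __intel_ring_space(),
moved into intel_ringbuffer.h by this patch, with CACHELINE_BYTES == 64):

    /* Purely illustrative numbers for the half-ring hysteresis check. */
    static unsigned int example_ring_space(unsigned int head,
                                           unsigned int tail,
                                           unsigned int size)
    {
        /* Same arithmetic as __intel_ring_space(); size is a power of two. */
        return (head - tail - 64 /* CACHELINE_BYTES */) & (size - 1);
    }

    /*
     * With a 16 KiB ring, emit (tail) at 12 KiB and a candidate request
     * whose postfix (head) sits at 6 KiB, retiring up to that request
     * frees example_ring_space(6144, 12288, 16384) == 10176 bytes, which
     * exceeds size / 2 == 8192, so waiting for it satisfies the throttle.
     */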

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 63 ++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 13 -----
 drivers/gpu/drm/i915/intel_ringbuffer.h    | 12 +++++
 3 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8eedf7cac493..84ef3abc567e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -753,6 +753,64 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	return 0;
 }
 
+static struct i915_request *__eb_wait_for_ring(struct intel_ring *ring)
+{
+	struct i915_request *rq;
+
+	if (intel_ring_update_space(ring) >= PAGE_SIZE)
+		return NULL;
+
+	/*
+	 * Find a request that after waiting upon, there will be at least half
+	 * the ring available. The hysteresis allows us to compete for the
+	 * shared ring and should mean that we sleep less often prior to
+	 * claiming our resources, but not so long that the ring completely
+	 * drains before we can submit our next request.
+	 */
+	list_for_each_entry(rq, &ring->request_list, ring_link) {
+		if (__intel_ring_space(rq->postfix,
+				       ring->emit, ring->size) > ring->size / 2)
+			break;
+	}
+	if (&rq->ring_link == &ring->request_list)
+		return NULL; /* weird, we will check again later for real */
+
+	return i915_request_get(rq);
+}
+
+static int eb_wait_for_ring(const struct i915_execbuffer *eb)
+{
+	const struct intel_context *ce;
+	struct i915_request *rq;
+	int ret = 0;
+
+	/*
+	 * Apply a light amount of backpressure to prevent excessive hogs
+	 * from blocking waiting for space whilst holding struct_mutex and
+	 * keeping all of their resources pinned.
+	 */
+
+	ce = to_intel_context(eb->ctx, eb->engine);
+	if (!ce->ring) /* first use, assume empty! */
+		return 0;
+
+	rq = __eb_wait_for_ring(ce->ring);
+	if (rq) {
+		mutex_unlock(&eb->i915->drm.struct_mutex);
+
+		if (i915_request_wait(rq,
+				      I915_WAIT_INTERRUPTIBLE,
+				      MAX_SCHEDULE_TIMEOUT) < 0)
+			ret = -EINTR;
+
+		i915_request_put(rq);
+
+		mutex_lock(&eb->i915->drm.struct_mutex);
+	}
+
+	return ret;
+}
+
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
@@ -2291,6 +2349,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (err)
 		goto err_rpm;
 
+	err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */
+	if (unlikely(err))
+		goto err_unlock;
+
 	err = eb_relocate(&eb);
 	if (err) {
 		/*
@@ -2435,6 +2497,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 err_vma:
 	if (eb.exec)
 		eb_release_vmas(&eb);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
 err_rpm:
 	intel_runtime_pm_put(eb.i915, wakeref);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b889b27f8aeb..7f841dba87b3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -49,19 +49,6 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 		I915_GEM_HWS_INDEX_ADDR);
 }
 
-static unsigned int __intel_ring_space(unsigned int head,
-				       unsigned int tail,
-				       unsigned int size)
-{
-	/*
-	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
-	 * same cacheline, the Head Pointer must not be greater than the Tail
-	 * Pointer."
-	 */
-	GEM_BUG_ON(!is_power_of_2(size));
-	return (head - tail - CACHELINE_BYTES) & (size - 1);
-}
-
 unsigned int intel_ring_update_space(struct intel_ring *ring)
 {
 	unsigned int space;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4d4ea6963a72..710ffb221775 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -832,6 +832,18 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 	return tail;
 }
 
+static inline unsigned int
+__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size)
+{
+	/*
+	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
+	 * same cacheline, the Head Pointer must not be greater than the Tail
+	 * Pointer."
+	 */
+	GEM_BUG_ON(!is_power_of_2(size));
+	return (head - tail - CACHELINE_BYTES) & (size - 1);
+}
+
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
 
 int intel_engine_setup_common(struct intel_engine_cs *engine);
-- 
2.20.1

* [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (7 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Currently, we may simultaneously release the fence register from both
fence_update() and i915_gem_restore_fences(). This is dangerous, so
defer the bookkeeping entirely to i915_gem_restore_fences() when the
device is asleep.
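
The shape of the fix is the usual "touch the hardware only if it is
already awake, otherwise let resume redo it" pattern; a simplified
sketch of fence_update() after this patch (error handling elided, see
the hunks below for the real code):

    wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
    if (!wakeref) {
        /* Asleep: defer the register write to i915_gem_restore_fences(). */
        GEM_BUG_ON(vma); /* only legal when releasing the fence */
        return 0;
    }

    /* ... bookkeeping that steals or moves the fence ... */
    fence_write(fence, vma);
    /* ... update fence->vma tracking ... */

    intel_runtime_pm_put(fence->i915, wakeref);
    return 0;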

Reported-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_fence_reg.c | 27 +++++++++++++++--------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index e037e94792f3..4331dadc7000 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -240,6 +240,22 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		i915_vma_flush_writes(old);
 	}
 
+	/*
+	 * We only need to update the register itself if the device is awake.
+	 * If the device is currently powered down, we will defer the write
+	 * to the runtime resume, see i915_gem_restore_fences().
+	 *
+	 * This only works for removing the fence register, on acquisition
+	 * the caller must hold the rpm wakeref. The fence register must
+	 * be cleared before we can use any other fences to ensure that
+	 * the new fences do not overlap the elided clears, confusing HW.
+	 */
+	wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
+	if (!wakeref) {
+		GEM_BUG_ON(vma);
+		return 0;
+	}
+
 	if (fence->vma && fence->vma != vma) {
 		/* Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
@@ -253,15 +269,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move(&fence->link, &fence->i915->mm.fence_list);
 	}
 
-	/* We only need to update the register itself if the device is awake.
-	 * If the device is currently powered down, we will defer the write
-	 * to the runtime resume, see i915_gem_restore_fences().
-	 */
-	wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
-	if (wakeref) {
-		fence_write(fence, vma);
-		intel_runtime_pm_put(fence->i915, wakeref);
-	}
+	fence_write(fence, vma);
 
 	if (vma) {
 		if (fence->vma != vma) {
@@ -272,6 +280,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	intel_runtime_pm_put(fence->i915, wakeref);
 	return 0;
 }
 
-- 
2.20.1

* [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
  2019-02-06 17:11 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
                   ` (6 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Previously, we were able to rely on the recursive properties of
struct_mutex to serialise revoking mmaps and reacquiring the FENCE
registers against them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma of how to serialise
access to a global resource was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    int readside(void)
    {
        int idx;

        rcu_read_lock();
        if (nomoresrcu) {
            rcu_read_unlock();
            return -EINVAL;
        }
        idx = srcu_read_lock(&ss);
        rcu_read_unlock();
        /* SRCU read-side critical section. */
        srcu_read_unlock(&ss, idx);
        return 0;
    }

    void cleanup(void)
    {
        nomoresrcu = 1;
        synchronize_rcu();
        synchronize_srcu(&ss);
        cleanup_srcu_struct(&ss);
    }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.
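
Mapping that pattern onto the driver, the read side becomes the new
i915_reset_trylock()/i915_reset_unlock() pair and the write side is the
global reset itself. A simplified sketch (the real code is split across
i915_handle_error(), do_reset() and the fault handler in the hunks
below; error is &i915->gpu_error):

    /* Read side, e.g. the pagefault handler around its GGTT/fence access. */
    srcu = i915_reset_trylock(i915); /* waits while I915_RESET_BACKOFF is set */
    if (srcu < 0)
        return srcu; /* -EINTR */
    /* ... touch the GGTT mmap and fence registers ... */
    i915_reset_unlock(i915, srcu);

    /* Write side, once I915_RESET_BACKOFF has been set for a global reset. */
    synchronize_rcu_expedited();                  /* new readers see BACKOFF */
    synchronize_srcu(&error->reset_backoff_srcu); /* drain existing readers */
    /* ... revoke mmaps, clobber fences, reset the GPU ... */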

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  12 +-
 drivers/gpu/drm/i915/i915_drv.h               |  18 +--
 drivers/gpu/drm/i915/i915_gem.c               |  56 +++------
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  34 ++----
 drivers/gpu/drm/i915/i915_gpu_error.h         |  39 ++-----
 drivers/gpu/drm/i915/i915_reset.c             | 110 +++++++++++-------
 drivers/gpu/drm/i915/i915_reset.h             |   4 +
 drivers/gpu/drm/i915/intel_drv.h              |   3 -
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   5 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   1 +
 10 files changed, 115 insertions(+), 167 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 0bd890c04fe4..a6fd157b1637 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
+	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Wedged\n");
+		seq_puts(m, "\tWedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_puts(m, "Waiter holding struct mutex\n");
-	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_puts(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "\tDevice (global) reset in progress\n");
 
 	if (!i915_modparams.enable_hangcheck) {
 		seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_backoff(&i915->gpu_error))
-		return -EAGAIN;
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a2293152cb6a..37230ae7fbe6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2989,7 +2989,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
 	i915_gem_object_unpin_pages(obj);
 }
 
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+	return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
 int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_device *dev,
 			 struct drm_mode_create_dumb *args);
@@ -3006,21 +3011,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
-	return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
 	return READ_ONCE(error->reset_count);
@@ -3093,7 +3088,6 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 05ce9176ac4e..1eb3a5f8654c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
-	int ret;
-
-	might_sleep();
-
-	/*
-	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
-	 * userspace. If it takes that long something really bad is going on and
-	 * we should simply try to bail out and fail as gracefully as possible.
-	 */
-	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_backoff(error),
-					       I915_RESET_TIMEOUT);
-	if (ret == 0) {
-		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
-		return -EIO;
-	} else if (ret < 0) {
-		return ret;
-	} else {
-		return 0;
-	}
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
-
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static u32 __i915_gem_park(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
@@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	intel_wakeref_t wakeref;
 	struct i915_vma *vma;
 	pgoff_t page_offset;
+	int srcu;
 	int ret;
 
 	/* Sanity check that we allow writing into this object */
@@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 		goto err_unlock;
 	}
 
-
 	/* Now pin it into the GTT as needed */
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 				       PIN_MAPPABLE |
@@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
+	srcu = i915_reset_trylock(dev_priv);
+	if (srcu < 0) {
+		ret = srcu;
+		goto err_unpin;
+	}
+
 	ret = i915_vma_pin_fence(vma);
 	if (ret)
-		goto err_unpin;
+		goto err_reset;
 
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
@@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 
 err_fence:
 	i915_vma_unpin_fence(vma);
+err_reset:
+	i915_reset_unlock(dev_priv, srcu);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
@@ -5326,6 +5293,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
+	init_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
@@ -5358,6 +5326,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
 
+	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
+
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 4331dadc7000..8ff78fd8020d 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -256,6 +256,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		return 0;
 	}
 
+	ret = i915_reset_trylock(fence->i915);
+	if (ret < 0)
+		goto out_rpm;
+
 	if (fence->vma && fence->vma != vma) {
 		/* Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
@@ -280,8 +284,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	i915_reset_unlock(fence->i915, ret);
+out_rpm:
 	intel_runtime_pm_put(fence->i915, wakeref);
-	return 0;
+	return ret;
 }
 
 /**
@@ -444,32 +450,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 	list_add(&fence->link, &fence->i915->mm.fence_list);
 }
 
-/**
- * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
- *
- * Removes all GTT mmappings via the fence registers. This forces any user
- * of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
- * revoke concurrent userspace access via GTT mmaps until the hardware has been
- * reset and the fence registers have been restored.
- */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
-{
-	int i;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
-
-		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
-
-		if (fence->vma)
-			i915_vma_revoke_mmap(fence->vma);
-	}
-}
-
 /**
  * i915_gem_restore_fences - restore fence state
  * @dev_priv: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..afa3adb28f02 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -204,39 +204,13 @@ struct i915_gpu_error {
 
 	atomic_t pending_fb_pin;
 
-	/**
-	 * State variable controlling the reset flow and count
-	 *
-	 * This is a counter which gets incremented when reset is triggered,
-	 *
-	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
-	 * meaning that any waiters holding onto the struct_mutex should
-	 * relinquish the lock immediately in order for the reset to start.
-	 *
-	 * If reset is not completed successfully, the I915_WEDGE bit is
-	 * set meaning that hardware is terminally sour and there is no
-	 * recovery. All waiters on the reset_queue will be woken when
-	 * that happens.
-	 *
-	 * This counter is used by the wait_seqno code to notice that reset
-	 * event happened and it needs to restart the entire ioctl (since most
-	 * likely the seqno it waited for won't ever signal anytime soon).
-	 *
-	 * This is important for lock-free wait paths, where no contended lock
-	 * naturally enforces the correct ordering between the bail-out of the
-	 * waiter and the gpu reset work code.
-	 */
-	unsigned long reset_count;
-
 	/**
 	 * flags: Control various stages of the GPU reset
 	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
+	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
+	 * serialise with any other users attempting to do the same, and
+	 * any global resources that may be clobbered by the reset (such as
+	 * FENCE registers).
 	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
@@ -255,6 +229,9 @@ struct i915_gpu_error {
 #define I915_RESET_ENGINE	2
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
+	/** Number of times the device has been reset (global) */
+	u32 reset_count;
+
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
 
@@ -272,6 +249,8 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
+	struct srcu_struct reset_backoff_srcu;
+
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 0e0ddf2e6815..c67d6c2a09a2 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -639,6 +639,32 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
 	engine->reset.prepare(engine);
 }
 
+static void revoke_mmaps(struct drm_i915_private *i915)
+{
+	int i;
+
+	for (i = 0; i < i915->num_fence_regs; i++) {
+		struct drm_vma_offset_node *node;
+		struct i915_vma *vma;
+		u64 vma_offset;
+
+		vma = READ_ONCE(i915->fence_regs[i].vma);
+		if (!vma)
+			continue;
+
+		if (!i915_vma_has_userfault(vma))
+			continue;
+
+		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
+		node = &vma->obj->base.vma_node;
+		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
+		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
+				    drm_vma_node_offset_addr(node) + vma_offset,
+				    vma->size,
+				    1);
+	}
+}
+
 static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -648,6 +674,7 @@ static void reset_prepare(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	intel_uc_sanitize(i915);
+	revoke_mmaps(i915);
 }
 
 static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +938,22 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
-struct __i915_reset {
-	struct drm_i915_private *i915;
-	unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
-	struct __i915_reset *arg = data;
-	int err;
-
-	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
-	if (err)
-		return err;
-
-	return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	struct __i915_reset arg = { i915, stalled_mask };
 	int err, i;
 
-	err = __do_reset(__i915_reset__BKL, &arg);
+	/* Flush everyone currently using a resource about to be clobbered */
+	synchronize_srcu(&i915->gpu_error.reset_backoff_srcu);
+
+	err = intel_gpu_reset(i915, ALL_ENGINES);
 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
-		msleep(100);
-		err = __do_reset(__i915_reset__BKL, &arg);
+		msleep(10 * (i + 1));
+		err = intel_gpu_reset(i915, ALL_ENGINES);
 	}
+	if (err)
+		return err;
 
-	return err;
+	return gt_reset(i915, stalled_mask);
 }
 
 /**
@@ -966,8 +965,6 @@ static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
  * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
  * on failure.
  *
- * Caller must hold the struct_mutex.
- *
  * Procedure is fairly simple:
  *   - reset the chip using the reset reg
  *   - re-init context state
@@ -1274,9 +1271,12 @@ void i915_handle_error(struct drm_i915_private *i915,
 		wait_event(i915->gpu_error.reset_queue,
 			   !test_bit(I915_RESET_BACKOFF,
 				     &i915->gpu_error.flags));
-		goto out;
+		goto out; /* piggy-back on the other reset */
 	}
 
+	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
+	synchronize_rcu_expedited();
+
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1300,36 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+int i915_reset_trylock(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	int srcu;
+
+	rcu_read_lock();
+	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
+		rcu_read_unlock();
+
+		if (wait_event_interruptible(error->reset_queue,
+					     !test_bit(I915_RESET_BACKOFF,
+						       &error->flags)))
+			return -EINTR;
+
+		rcu_read_lock();
+	}
+	srcu = srcu_read_lock(&error->reset_backoff_srcu);
+	rcu_read_unlock();
+
+	return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+__releases(&i915->gpu_error.reset_backoff_srcu)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	srcu_read_unlock(&error->reset_backoff_srcu, tag);
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..893c5d1c2eb8 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/srcu.h>
 
 struct drm_i915_private;
 struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
 void i915_reset_request(struct i915_request *rq, bool guilty);
 bool i915_reset_flush(struct drm_i915_private *i915);
 
+int __must_check i915_reset_trylock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 90ba5436370e..3500d1ee32ab 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -979,9 +979,6 @@ struct intel_crtc {
 
 	struct intel_crtc_state *config;
 
-	/* global reset count when the last flip was submitted */
-	unsigned int reset_count;
-
 	/* Access to these should be protected by dev_priv->irq_lock. */
 	bool cpu_fifo_underrun_disabled;
 	bool pch_fifo_underrun_disabled;
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 7b6f3bea9ef8..4886fac12628 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1039,8 +1039,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 
 	/* Check that we can recover an unbind stuck on a hanging request */
 
-	igt_global_reset_lock(i915);
-
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
 	if (err)
@@ -1138,7 +1136,9 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
+	igt_global_reset_lock(i915);
 	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
+	igt_global_reset_unlock(i915);
 
 	if (tsk) {
 		struct igt_wedge_me w;
@@ -1159,7 +1159,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
-	igt_global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..fc516a2970f4 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	init_srcu_struct(&i915->gpu_error.reset_backoff_srcu);
 	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
-- 
2.20.1

* [PATCH 4/8] drm/i915: Force the GPU reset upon wedging
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
  2019-02-06 17:11 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
  2019-02-06 17:11 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

When declaring the GPU wedged, we do need to hit the GPU with the reset
hammer so that its state matches our presumed state during cleanup. If
the reset fails, it fails, and we may be unhappy but wedged. However, if
we are testing our wedge/unwedge handling, the desync carries over into
the next test and promptly explodes.

References: https://bugs.freedesktop.org/show_bug.cgi?id=106702
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_reset.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c67d6c2a09a2..64f26e17243a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -532,9 +532,6 @@ typedef int (*reset_func)(struct drm_i915_private *,
 
 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
 {
-	if (!i915_modparams.reset)
-		return NULL;
-
 	if (INTEL_GEN(i915) >= 8)
 		return gen8_reset_engines;
 	else if (INTEL_GEN(i915) >= 6)
@@ -599,6 +596,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *i915)
 	if (USES_GUC(i915))
 		return false;
 
+	if (!i915_modparams.reset)
+		return false;
+
 	return intel_get_gpu_reset(i915);
 }
 
@@ -824,7 +824,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	/* Even if the GPU reset fails, it should still stop the engines */
-	if (INTEL_GEN(i915) >= 5)
+	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
 		intel_gpu_reset(i915, ALL_ENGINES);
 
 	for_each_engine(engine, i915, id) {
-- 
2.20.1

* [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (2 preceding siblings ...)
  2019-02-06 17:11 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
                   ` (4 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

On wedging, we mark all executing requests as complete and complete all
pending requests as soon as they are ready. Before unwedging, though, we
wish to flush those pending requests prior to restoring default
execution, and so we must wait. Do so uninterruptibly, as we do not
propagate the EINTR gracefully back to userspace in this case but would
instead persist in the permanently wedged state without restarting the
syscall.
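
The mechanical change is the interruptible flag passed to
dma_fence_default_wait(); simplified, the drain loop after this patch
reads (held under i915->gt.timelines.mutex as before):

    list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
        struct i915_request *rq;

        rq = i915_active_request_get_unlocked(&tl->last_request);
        if (!rq)
            continue;

        /*
         * intr=false: ignore signals; external fences are capped at
         * I915_FENCE_TIMEOUT, so this cannot wait forever.
         */
        dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
        i915_request_put(rq);
    }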

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 64f26e17243a..c4fcb450bd80 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -862,7 +862,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
-	bool ret = false;
 
 	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
@@ -887,30 +886,20 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	mutex_lock(&i915->gt.timelines.mutex);
 	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
 		struct i915_request *rq;
-		long timeout;
 
 		rq = i915_active_request_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
 		/*
-		 * We can't use our normal waiter as we want to
-		 * avoid recursively trying to handle the current
-		 * reset. The basic dma_fence_default_wait() installs
-		 * a callback for dma_fence_signal(), which is
-		 * triggered by our nop handler (indirectly, the
-		 * callback enables the signaler thread which is
-		 * woken by the nop_submit_request() advancing the seqno
-		 * and when the seqno passes the fence, the signaler
-		 * then signals the fence waking us up).
+		 * All internal dependencies (i915_requests) will have
+		 * been flushed by the set-wedge, but we may be stuck waiting
+		 * for external fences. These should all be capped to 10s
+		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
+		 * in the worst case.
 		 */
-		timeout = dma_fence_default_wait(&rq->fence, true,
-						 MAX_SCHEDULE_TIMEOUT);
+		dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
 		i915_request_put(rq);
-		if (timeout < 0) {
-			mutex_unlock(&i915->gt.timelines.mutex);
-			goto unlock;
-		}
 	}
 	mutex_unlock(&i915->gt.timelines.mutex);
 
@@ -931,11 +920,10 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-	ret = true;
-unlock:
+
 	mutex_unlock(&i915->gpu_error.wedge_mutex);
 
-	return ret;
+	return true;
 }
 
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
-- 
2.20.1

* [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (3 preceding siblings ...)
  2019-02-06 17:11 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Since we use debugfs/i915_wedged to recover the device after modifying
the i915.reset parameter, we need to be sure that we apply a fresh reset
and do not piggy-back onto a concurrent one, in order for the new
parameter value to take effect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a6fd157b1637..8a488ffc8b7d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
 
-	/*
-	 * There is no safeguard against this debugfs entry colliding
-	 * with the hangcheck calling same i915_handle_error() in
-	 * parallel, causing an explosion. For now we assume that the
-	 * test harness is responsible enough not to inject gpu hangs
-	 * while it is writing to 'i915_wedged'
-	 */
+	/* Flush any previous reset before applying for a new one */
+	wait_event(i915->gpu_error.reset_queue,
+		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-- 
2.20.1

* [PATCH 7/8] drm/i915: Serialise resets with wedging
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (4 preceding siblings ...)
  2019-02-06 17:11 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:11 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 68 ++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c4fcb450bd80..9494b015185a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -794,17 +794,14 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags))
 		return;
-	}
 
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
@@ -853,12 +850,18 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	__i915_gem_set_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -869,8 +872,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -921,11 +922,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -975,7 +986,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1037,7 +1048,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1129,7 +1140,9 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
+		mutex_lock(&error->wedge_mutex);
 		i915_reset(i915, engine_mask, reason);
+		mutex_unlock(&error->wedge_mutex);
 
 		intel_finish_reset(i915);
 	}
@@ -1197,6 +1210,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1233,20 +1247,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1255,10 +1268,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1268,8 +1280,8 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
@@ -1278,11 +1290,11 @@ void i915_handle_error(struct drm_i915_private *i915,
 
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);
-- 
2.20.1

* [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (5 preceding siblings ...)
  2019-02-06 17:11 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
@ 2019-02-06 17:11 ` Chris Wilson
  2019-02-06 17:35 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs Patchwork
  2019-02-06 18:01 ` ✗ Fi.CI.BAT: failure " Patchwork
  8 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.
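
Condensed, the new logic in execlists_reset() below amounts to (sketch
only; rq stands for the request found active on the engine at reset
time):

    /*
     * Payload never started and the context image still looks sane:
     * the request will replay perfectly, so do not blame its context.
     */
    if (!i915_request_started(rq) && lrc_regs_ok(rq))
        goto out_unlock;

    /*
     * Otherwise record guilt/innocence; only skip the register fixup
     * if the request was innocent *and* the context image is intact.
     */
    i915_reset_request(rq, stalled);
    if (!stalled && lrc_regs_ok(rq))
        goto out_unlock;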

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c              | 34 ++++++++++++++++++-
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 ++++-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 ++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..5d5ce91a5dfa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1878,6 +1878,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+	const struct intel_ring *ring = rq->ring;
+	const u32 *regs = rq->hw_context->lrc_reg_state;
+
+	/* Quick spot check for the common signs of context corruption */
+
+	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+	    (RING_CTL_SIZE(ring->size) | RING_VALID))
+		return false;
+
+	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+		return false;
+
+	return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1929,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	if (!rq)
 		goto out_unlock;
 
+	/*
+	 * If this request hasn't started yet, e.g. it is waiting on a
+	 * semaphore, we need to avoid skipping the request or else we
+	 * break the signaling chain. However, if the context is corrupt
+	 * the request will not restart and we will be stuck with a wedged
+	 * device. It is quite often the case that if we issue a reset
+	 * while the GPU is loading the context image, that context image
+	 * becomes corrupt.
+	 *
+	 * Otherwise, if we have not started yet, the request should replay
+	 * perfectly and we do not need to flag the result as being erroneous.
+	 */
+	if (!i915_request_started(rq) && lrc_regs_ok(rq))
+		goto out_unlock;
+
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1924,7 +1956,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	 * image back to the expected values to skip over the guilty request.
 	 */
 	i915_reset_request(rq, stalled);
-	if (!stalled)
+	if (!stalled && lrc_regs_ok(rq))
 		goto out_unlock;
 
 	/*
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
 	*batch++ = upper_32_bits(vma->node.start);
 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
 
-	i915_gem_chipset_flush(spin->i915);
+	if (engine->emit_init_breadcrumb &&
+	    rq->timeline->has_initial_breadcrumb) {
+		err = engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
 
 	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
+	i915_gem_chipset_flush(spin->i915);
+
 cancel_rq:
 	if (err) {
 		i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..36c17bfe05a7 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 	if (INTEL_GEN(vm->i915) <= 5)
 		flags |= I915_DISPATCH_SECURE;
 
+	if (rq->engine->emit_init_breadcrumb) {
+		err = rq->engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
+
 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 
 cancel_rq:
-- 
2.20.1

* ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (6 preceding siblings ...)
  2019-02-06 17:11 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
@ 2019-02-06 17:35 ` Patchwork
  2019-02-06 18:01 ` ✗ Fi.CI.BAT: failure " Patchwork
  8 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2019-02-06 17:35 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
URL   : https://patchwork.freedesktop.org/series/56294/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Hack and slash, throttle execbuffer hogs
Okay!

Commit: drm/i915: Defer removing fence register tracking to rpm wakeup
Okay!

Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1303:5: warning: context imbalance in 'i915_reset_trylock' - different lock contexts for basic block
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3565:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3559:16: warning: expression using sizeof(void)

Commit: drm/i915: Force the GPU reset upon wedging
Okay!

Commit: drm/i915: Uninterruptibly drain the timelines on unwedging
Okay!

Commit: drm/i915: Wait for old resets before applying debugfs/i915_wedged
Okay!

Commit: drm/i915: Serialise resets with wedging
Okay!

Commit: drm/i915: Don't claim an unstarted request was guilty
Okay!

* ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (7 preceding siblings ...)
  2019-02-06 17:35 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs Patchwork
@ 2019-02-06 18:01 ` Patchwork
  2019-02-06 18:03   ` Chris Wilson
  8 siblings, 1 reply; 14+ messages in thread
From: Patchwork @ 2019-02-06 18:01 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
URL   : https://patchwork.freedesktop.org/series/56294/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5552 -> Patchwork_12157
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12157 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12157, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/56294/revisions/1/mbox/

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12157:

### IGT changes ###

#### Possible regressions ####

  * igt@i915_selftest@live_hangcheck:
    - fi-bdw-5557u:       PASS -> INCOMPLETE
    - fi-cfl-8700k:       PASS -> INCOMPLETE
    - fi-bsw-n3050:       PASS -> INCOMPLETE

  * igt@kms_busy@basic-flip-a:
    - fi-byt-n2820:       PASS -> CRASH
    - fi-elk-e7500:       PASS -> CRASH
    - fi-byt-j1900:       PASS -> CRASH
    - fi-snb-2600:        PASS -> CRASH
    - fi-hsw-peppy:       PASS -> CRASH
    - fi-gdg-551:         PASS -> CRASH
    - fi-bwr-2160:        PASS -> CRASH
    - fi-snb-2520m:       NOTRUN -> CRASH
    - fi-ivb-3520m:       PASS -> CRASH
    - fi-hsw-4770:        PASS -> CRASH
    - fi-ivb-3770:        PASS -> CRASH
    - fi-ilk-650:         NOTRUN -> CRASH
    - fi-blb-e6850:       NOTRUN -> CRASH
    - fi-hsw-4770r:       PASS -> CRASH
    - fi-byt-clapper:     PASS -> CRASH
    - fi-pnv-d510:        PASS -> CRASH

  
#### Suppressed ####

  The following results come from untrusted machines, tests, or statuses.
  They do not affect the overall result.

  * igt@i915_selftest@live_hangcheck:
    - {fi-whl-u}:         PASS -> INCOMPLETE

  * {igt@runner@aborted}:
    - fi-pnv-d510:        NOTRUN -> FAIL
    - fi-hsw-peppy:       NOTRUN -> FAIL
    - fi-gdg-551:         NOTRUN -> FAIL
    - fi-snb-2520m:       NOTRUN -> FAIL
    - fi-hsw-4770:        NOTRUN -> FAIL
    - fi-ivb-3770:        NOTRUN -> FAIL
    - fi-byt-j1900:       NOTRUN -> FAIL
    - fi-blb-e6850:       NOTRUN -> FAIL
    - fi-hsw-4770r:       NOTRUN -> FAIL
    - fi-byt-clapper:     NOTRUN -> FAIL
    - fi-bwr-2160:        NOTRUN -> FAIL
    - fi-byt-n2820:       NOTRUN -> FAIL
    - fi-ivb-3520m:       NOTRUN -> FAIL
    - fi-snb-2600:        NOTRUN -> FAIL
    - fi-elk-e7500:       NOTRUN -> FAIL

  
Known issues
------------

  Here are the changes found in Patchwork_12157 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live_evict:
    - fi-bsw-kefka:       PASS -> DMESG-WARN [fdo#107709]

  * igt@i915_selftest@live_execlists:
    - fi-apl-guc:         PASS -> INCOMPLETE [fdo#103927]

  * igt@i915_selftest@live_hangcheck:
    - fi-bdw-gvtdvm:      PASS -> INCOMPLETE [fdo#105600]
    - fi-kbl-r:           PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7567u:       PASS -> INCOMPLETE [fdo#108744]
    - fi-glk-j4005:       PASS -> INCOMPLETE [fdo#103359] / [k.org#198133]
    - fi-cfl-8109u:       PASS -> INCOMPLETE [fdo#106070]
    - fi-skl-iommu:       PASS -> INCOMPLETE [fdo#108744]
    - fi-skl-gvtdvm:      PASS -> INCOMPLETE [fdo#105600] / [fdo#108744]
    - fi-kbl-x1275:       PASS -> INCOMPLETE [fdo#108744]
    - fi-bxt-j4205:       PASS -> INCOMPLETE [fdo#103927]
    - fi-skl-6700hq:      PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7500u:       PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-8809g:       PASS -> INCOMPLETE [fdo#108744]
    - fi-skl-6600u:       PASS -> INCOMPLETE [fdo#108744]
    - fi-kbl-7560u:       PASS -> INCOMPLETE [fdo#108044] / [fdo#108744]
    - fi-skl-6770hq:      PASS -> INCOMPLETE [fdo#108744]
    - fi-skl-6700k2:      PASS -> INCOMPLETE [fdo#108744]
    - fi-skl-6260u:       PASS -> INCOMPLETE [fdo#108744]

  
#### Possible fixes ####

  * igt@gem_exec_suspend@basic-s4-devices:
    - fi-blb-e6850:       INCOMPLETE [fdo#107718] -> PASS

  * igt@pm_rpm@basic-pci-d3-state:
    - fi-bsw-kefka:       {SKIP} [fdo#109271] -> PASS

  * igt@pm_rpm@basic-rte:
    - fi-bsw-kefka:       FAIL [fdo#108800] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103359]: https://bugs.freedesktop.org/show_bug.cgi?id=103359
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#105600]: https://bugs.freedesktop.org/show_bug.cgi?id=105600
  [fdo#106070]: https://bugs.freedesktop.org/show_bug.cgi?id=106070
  [fdo#107709]: https://bugs.freedesktop.org/show_bug.cgi?id=107709
  [fdo#107718]: https://bugs.freedesktop.org/show_bug.cgi?id=107718
  [fdo#108044]: https://bugs.freedesktop.org/show_bug.cgi?id=108044
  [fdo#108569]: https://bugs.freedesktop.org/show_bug.cgi?id=108569
  [fdo#108622]: https://bugs.freedesktop.org/show_bug.cgi?id=108622
  [fdo#108744]: https://bugs.freedesktop.org/show_bug.cgi?id=108744
  [fdo#108788]: https://bugs.freedesktop.org/show_bug.cgi?id=108788
  [fdo#108800]: https://bugs.freedesktop.org/show_bug.cgi?id=108800
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [k.org#198133]: https://bugzilla.kernel.org/show_bug.cgi?id=198133


Participating hosts (48 -> 45)
------------------------------

  Additional (2): fi-ilk-650 fi-snb-2520m 
  Missing    (5): fi-kbl-soraka fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-icl-y 


Build changes
-------------

    * Linux: CI_DRM_5552 -> Patchwork_12157

  CI_DRM_5552: b7cd857f32e0b1d5ea61e5eeebc0334bafb34eaa @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4812: 592b854fead32c2b0dac7198edfb9a6bffd66932 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12157: 8dbe9c760b28086c24328d779beec1293b385203 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

8dbe9c760b28 drm/i915: Don't claim an unstarted request was guilty
f598e54a7d20 drm/i915: Serialise resets with wedging
912154f2cc59 drm/i915: Wait for old resets before applying debugfs/i915_wedged
04acd9626e6b drm/i915: Uninterruptibly drain the timelines on unwedging
09ae27568e0d drm/i915: Force the GPU reset upon wedging
c9140ab1a345 drm/i915: Revoke mmaps and prevent access to fence registers across reset
0c30a4ac6fa6 drm/i915: Defer removing fence register tracking to rpm wakeup
561d3cea096c drm/i915: Hack and slash, throttle execbuffer hogs

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12157/

* Re: ✗ Fi.CI.BAT: failure for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-06 18:01 ` ✗ Fi.CI.BAT: failure " Patchwork
@ 2019-02-06 18:03   ` Chris Wilson
  0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-06 18:03 UTC (permalink / raw)
  To: Patchwork; +Cc: intel-gfx

Quoting Patchwork (2019-02-06 18:01:31)
> == Series Details ==
> 
> Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs
> URL   : https://patchwork.freedesktop.org/series/56294/
> State : failure
> 
> == Summary ==
> 
> CI Bug Log - changes from CI_DRM_5552 -> Patchwork_12157
> ====================================================
> 
> Summary
> -------
> 
>   **FAILURE**
> 
>   Serious unknown changes coming with Patchwork_12157 absolutely need to be
>   verified manually.
>   
>   If you think the reported changes have nothing to do with the changes
>   introduced in Patchwork_12157, please notify your bug team to allow them
>   to document this new failure mode, which will reduce false positives in CI.
> 
>   External URL: https://patchwork.freedesktop.org/api/1.0/series/56294/revisions/1/mbox/
> 
> Possible new issues
> -------------------
> 
>   Here are the unknown changes that may have been introduced in Patchwork_12157:
> 
> ### IGT changes ###
> 
> #### Possible regressions ####
> 
>   * igt@i915_selftest@live_hangcheck:
>     - fi-bdw-5557u:       PASS -> INCOMPLETE
>     - fi-cfl-8700k:       PASS -> INCOMPLETE
>     - fi-bsw-n3050:       PASS -> INCOMPLETE
> 
>   * igt@kms_busy@basic-flip-a:
>     - fi-byt-n2820:       PASS -> CRASH
>     - fi-elk-e7500:       PASS -> CRASH
>     - fi-byt-j1900:       PASS -> CRASH
>     - fi-snb-2600:        PASS -> CRASH
>     - fi-hsw-peppy:       PASS -> CRASH
>     - fi-gdg-551:         PASS -> CRASH
>     - fi-bwr-2160:        PASS -> CRASH
>     - fi-snb-2520m:       NOTRUN -> CRASH
>     - fi-ivb-3520m:       PASS -> CRASH
>     - fi-hsw-4770:        PASS -> CRASH
>     - fi-ivb-3770:        PASS -> CRASH
>     - fi-ilk-650:         NOTRUN -> CRASH
>     - fi-blb-e6850:       NOTRUN -> CRASH
>     - fi-hsw-4770r:       PASS -> CRASH
>     - fi-byt-clapper:     PASS -> CRASH
>     - fi-pnv-d510:        PASS -> CRASH

Rebase wasn't careful enough and returned the srcu tag from
fence_update().
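
Roughly this shape, I think (a simplified sketch only, not the actual
fence_update() body; the reset_backoff_srcu member name is assumed from
patch 3/8 of this series and the locking is shown as raw SRCU calls):
the cookie from srcu_read_lock() has to be handed back to
srcu_read_unlock() and the function should return 0 on success, whereas
the bad rebase leaked the cookie out as the return value, so callers
saw a spurious non-zero result.

static int fence_update_sketch(struct drm_i915_private *i915)
{
	int tag;

	/* hold off resets while the fence register is rewritten */
	tag = srcu_read_lock(&i915->gpu_error.reset_backoff_srcu);

	/* ... rewrite the fence register ... */

	srcu_read_unlock(&i915->gpu_error.reset_backoff_srcu, tag);

	return 0;	/* the broken rebase returned 'tag' here */
}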
-Chris

* Re: [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-08  9:56   ` Mika Kuoppala
@ 2019-02-08 10:01     ` Chris Wilson
  0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2019-02-08 10:01 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 09:56:59)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Since we use the debugfs to recover the device after modifying the
> > i915.reset parameter, we need to be sure that we apply the reset and not
> > piggy-back onto a concurrent one in order for the parameter to take
> > effect.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
> >  1 file changed, 3 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index a6fd157b1637..8a488ffc8b7d 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
> >  {
> >       struct drm_i915_private *i915 = data;
> >  
> > -     /*
> > -      * There is no safeguard against this debugfs entry colliding
> > -      * with the hangcheck calling same i915_handle_error() in
> > -      * parallel, causing an explosion. For now we assume that the
> > -      * test harness is responsible enough not to inject gpu hangs
> > -      * while it is writing to 'i915_wedged'
> > -      */
> > +     /* Flush any previous reset before applying for a new one */
> > +     wait_event(i915->gpu_error.reset_queue,
> > +                !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
> 
> You removed the comment, and yes, this makes us wait for our turn
> to flip the switch. But the race between hangcheck and this path still holds.

Concurrent resets have been safe for yonks... But what I realised is
that piggy-backing the two resets into one meant that if the value of
the i915.reset modparam changed, we didn't run with the updated value.
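
Sketching the coalescing (a simplified illustration with a made-up
wrapper name, not the exact i915_handle_error() logic): if a reset is
already in flight, a second caller merely waits for it and returns, so
the only reset it ever observes is the one begun under the old
i915.reset value. Flushing I915_RESET_BACKOFF first in i915_wedged_set()
guarantees the debugfs write starts a fresh reset that re-reads the
modparam.

static void handle_error_sketch(struct drm_i915_private *i915)
{
	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
		/* join the in-flight reset, begun under the old i915.reset */
		wait_event(i915->gpu_error.reset_queue,
			   !test_bit(I915_RESET_BACKOFF,
				     &i915->gpu_error.flags));
		return;	/* the updated modparam is never consulted */
	}

	/* ... otherwise start a fresh reset, sampling i915.reset now ... */
}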
-Chris

* Re: [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
@ 2019-02-08  9:56   ` Mika Kuoppala
  2019-02-08 10:01     ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Mika Kuoppala @ 2019-02-08  9:56 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since we use the debugfs to recover the device after modifying the
> i915.reset parameter, we need to be sure that we apply the reset and not
> piggy-back onto a concurrent one in order for the parameter to take
> effect.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index a6fd157b1637..8a488ffc8b7d 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
>  {
>  	struct drm_i915_private *i915 = data;
>  
> -	/*
> -	 * There is no safeguard against this debugfs entry colliding
> -	 * with the hangcheck calling same i915_handle_error() in
> -	 * parallel, causing an explosion. For now we assume that the
> -	 * test harness is responsible enough not to inject gpu hangs
> -	 * while it is writing to 'i915_wedged'
> -	 */
> +	/* Flush any previous reset before applying for a new one */
> +	wait_event(i915->gpu_error.reset_queue,
> +		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));

You removed the comment, and yes, this makes us wait for our turn
to flip the switch. But the race between hangcheck and this path still holds.

Now even if the two were to pile onto this switch, there should be
no harm, as in that case we would just see two log entries
resulting in a single reset.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

-Mika
>  
>  	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
>  			  "Manually set wedged engine mask = %llx", val);
> -- 
> 2.20.1

* [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-07  7:18 [PATCH 1/8] " Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-08  9:56   ` Mika Kuoppala
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

Since we use the debugfs to recover the device after modifying the
i915.reset parameter, we need to be sure that we apply the reset and not
piggy-back onto a concurrent one in order for the parameter to take
effect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a6fd157b1637..8a488ffc8b7d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
 
-	/*
-	 * There is no safeguard against this debugfs entry colliding
-	 * with the hangcheck calling same i915_handle_error() in
-	 * parallel, causing an explosion. For now we assume that the
-	 * test harness is responsible enough not to inject gpu hangs
-	 * while it is writing to 'i915_wedged'
-	 */
+	/* Flush any previous reset before applying for a new one */
+	wait_event(i915->gpu_error.reset_queue,
+		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-- 
2.20.1


end of thread, other threads:[~2019-02-08 10:02 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-06 17:11 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
2019-02-06 17:11 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
2019-02-06 17:11 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
2019-02-06 17:11 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
2019-02-06 17:11 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
2019-02-06 17:11 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
2019-02-06 17:11 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
2019-02-06 17:11 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
2019-02-06 17:35 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs Patchwork
2019-02-06 18:01 ` ✗ Fi.CI.BAT: failure " Patchwork
2019-02-06 18:03   ` Chris Wilson
2019-02-07  7:18 [PATCH 1/8] " Chris Wilson
2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
2019-02-08  9:56   ` Mika Kuoppala
2019-02-08 10:01     ` Chris Wilson
