All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
@ 2019-02-07  7:18 Chris Wilson
  2019-02-07  7:18 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
                   ` (10 more replies)
  0 siblings, 11 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

Apply backpressure to hogs that emit requests faster than the GPU can
process them by waiting for their ring to be less than half-full before
proceeding with taking the struct_mutex.

This is a gross hack to apply throttling backpressure, the long term
goal is to remove the struct_mutex contention so that each client
naturally waits, preferably in an asynchronous, nonblocking fashion
(pipelined operations for the win), for their own resources and never
blocks another client within the driver at least. (Realtime priority
goals would extend to ensuring that resource contention favours high
priority clients as well.)

This patch only limits excessive request production and does not attempt
to throttle clients that block waiting for eviction (either global GTT or
system memory) or any other global resources, see above for the long term
goal.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 63 ++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 13 -----
 drivers/gpu/drm/i915/intel_ringbuffer.h    | 12 +++++
 3 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8eedf7cac493..84ef3abc567e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -753,6 +753,64 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	return 0;
 }
 
+static struct i915_request *__eb_wait_for_ring(struct intel_ring *ring)
+{
+	struct i915_request *rq;
+
+	if (intel_ring_update_space(ring) >= PAGE_SIZE)
+		return NULL;
+
+	/*
+	 * Find a request that after waiting upon, there will be at least half
+	 * the ring available. The hysteresis allows us to compete for the
+	 * shared ring and should mean that we sleep less often prior to
+	 * claiming our resources, but not so long that the ring completely
+	 * drains before we can submit our next request.
+	 */
+	list_for_each_entry(rq, &ring->request_list, ring_link) {
+		if (__intel_ring_space(rq->postfix,
+				       ring->emit, ring->size) > ring->size / 2)
+			break;
+	}
+	if (&rq->ring_link == &ring->request_list)
+		return NULL; /* weird, we will check again later for real */
+
+	return i915_request_get(rq);
+}
+
+static int eb_wait_for_ring(const struct i915_execbuffer *eb)
+{
+	const struct intel_context *ce;
+	struct i915_request *rq;
+	int ret = 0;
+
+	/*
+	 * Apply a light amount of backpressure to prevent excessive hogs
+	 * from blocking waiting for space whilst holding struct_mutex and
+	 * keeping all of their resources pinned.
+	 */
+
+	ce = to_intel_context(eb->ctx, eb->engine);
+	if (!ce->ring) /* first use, assume empty! */
+		return 0;
+
+	rq = __eb_wait_for_ring(ce->ring);
+	if (rq) {
+		mutex_unlock(&eb->i915->drm.struct_mutex);
+
+		if (i915_request_wait(rq,
+				      I915_WAIT_INTERRUPTIBLE,
+				      MAX_SCHEDULE_TIMEOUT) < 0)
+			ret = -EINTR;
+
+		i915_request_put(rq);
+
+		mutex_lock(&eb->i915->drm.struct_mutex);
+	}
+
+	return ret;
+}
+
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
@@ -2291,6 +2349,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (err)
 		goto err_rpm;
 
+	err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */
+	if (unlikely(err))
+		goto err_unlock;
+
 	err = eb_relocate(&eb);
 	if (err) {
 		/*
@@ -2435,6 +2497,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 err_vma:
 	if (eb.exec)
 		eb_release_vmas(&eb);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
 err_rpm:
 	intel_runtime_pm_put(eb.i915, wakeref);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b889b27f8aeb..7f841dba87b3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -49,19 +49,6 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 		I915_GEM_HWS_INDEX_ADDR);
 }
 
-static unsigned int __intel_ring_space(unsigned int head,
-				       unsigned int tail,
-				       unsigned int size)
-{
-	/*
-	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
-	 * same cacheline, the Head Pointer must not be greater than the Tail
-	 * Pointer."
-	 */
-	GEM_BUG_ON(!is_power_of_2(size));
-	return (head - tail - CACHELINE_BYTES) & (size - 1);
-}
-
 unsigned int intel_ring_update_space(struct intel_ring *ring)
 {
 	unsigned int space;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4d4ea6963a72..710ffb221775 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -832,6 +832,18 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 	return tail;
 }
 
+static inline unsigned int
+__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size)
+{
+	/*
+	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
+	 * same cacheline, the Head Pointer must not be greater than the Tail
+	 * Pointer."
+	 */
+	GEM_BUG_ON(!is_power_of_2(size));
+	return (head - tail - CACHELINE_BYTES) & (size - 1);
+}
+
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
 
 int intel_engine_setup_common(struct intel_engine_cs *engine);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-07 13:22   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (9 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

Currently, we may simultaneously release the fence register from both
fence_update() and i915_gem_restore_fences(). This is dangerous, so
defer the bookkeeping entirely to i915_gem_restore_fences() when the
device is asleep.

Reported-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_fence_reg.c | 62 ++++++++++++-----------
 1 file changed, 32 insertions(+), 30 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index e037e94792f3..be89bd95ab7c 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -210,6 +210,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 			struct i915_vma *vma)
 {
 	intel_wakeref_t wakeref;
+	struct i915_vma *old;
 	int ret;
 
 	if (vma) {
@@ -229,49 +230,55 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 			return ret;
 	}
 
-	if (fence->vma) {
-		struct i915_vma *old = fence->vma;
-
+	old = xchg(&fence->vma, NULL);
+	if (old) {
 		ret = i915_active_request_retire(&old->last_fence,
 					     &old->obj->base.dev->struct_mutex);
-		if (ret)
+		if (ret) {
+			fence->vma = old;
 			return ret;
+		}
 
 		i915_vma_flush_writes(old);
-	}
 
-	if (fence->vma && fence->vma != vma) {
-		/* Ensure that all userspace CPU access is completed before
+		/*
+		 * Ensure that all userspace CPU access is completed before
 		 * stealing the fence.
 		 */
-		GEM_BUG_ON(fence->vma->fence != fence);
-		i915_vma_revoke_mmap(fence->vma);
-
-		fence->vma->fence = NULL;
-		fence->vma = NULL;
+		if (old != vma) {
+			GEM_BUG_ON(old->fence != fence);
+			i915_vma_revoke_mmap(old);
+			old->fence = NULL;
+		}
 
 		list_move(&fence->link, &fence->i915->mm.fence_list);
 	}
 
-	/* We only need to update the register itself if the device is awake.
+	/*
+	 * We only need to update the register itself if the device is awake.
 	 * If the device is currently powered down, we will defer the write
 	 * to the runtime resume, see i915_gem_restore_fences().
+	 *
+	 * This only works for removing the fence register, on acquisition
+	 * the caller must hold the rpm wakeref. The fence register must
+	 * be cleared before we can use any other fences to ensure that
+	 * the new fences do not overlap the elided clears, confusing HW.
 	 */
 	wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
-	if (wakeref) {
-		fence_write(fence, vma);
-		intel_runtime_pm_put(fence->i915, wakeref);
+	if (!wakeref) {
+		GEM_BUG_ON(vma);
+		return 0;
 	}
 
-	if (vma) {
-		if (fence->vma != vma) {
-			vma->fence = fence;
-			fence->vma = vma;
-		}
+	fence_write(fence, vma);
+	fence->vma = vma;
 
+	if (vma) {
+		vma->fence = fence;
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	intel_runtime_pm_put(fence->i915, wakeref);
 	return 0;
 }
 
@@ -473,9 +480,10 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 {
 	int i;
 
+	rcu_read_lock(); /* keep obj alive as we dereference */
 	for (i = 0; i < dev_priv->num_fence_regs; i++) {
 		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
-		struct i915_vma *vma = reg->vma;
+		struct i915_vma *vma = READ_ONCE(reg->vma);
 
 		GEM_BUG_ON(vma && vma->fence != reg);
 
@@ -483,18 +491,12 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
 		 * Commit delayed tiling changes if we have an object still
 		 * attached to the fence, otherwise just clear the fence.
 		 */
-		if (vma && !i915_gem_object_is_tiled(vma->obj)) {
-			GEM_BUG_ON(!reg->dirty);
-			GEM_BUG_ON(i915_vma_has_userfault(vma));
-
-			list_move(&reg->link, &dev_priv->mm.fence_list);
-			vma->fence = NULL;
+		if (vma && !i915_gem_object_is_tiled(vma->obj))
 			vma = NULL;
-		}
 
 		fence_write(reg, vma);
-		reg->vma = vma;
 	}
+	rcu_read_unlock();
 }
 
 /**
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
  2019-02-07  7:18 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-07 15:05   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
                   ` (8 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers with them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, the dilemma on how to serialise
access to a global resource on one side was answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    1  int readside(void) {
    2      int idx;
    3      rcu_read_lock();
    4	   if (nomoresrcu) {
    5          rcu_read_unlock();
    6	       return -EINVAL;
    7      }
    8	   idx = srcu_read_lock(&ss);
    9	   rcu_read_unlock();
    10	   /* SRCU read-side critical section. */
    11	   srcu_read_unlock(&ss, idx);
    12	   return 0;
    13 }
    14
    15 void cleanup(void)
    16 {
    17     nomoresrcu = 1;
    18     synchronize_rcu();
    19     synchronize_srcu(&ss);
    20     cleanup_srcu_struct(&ss);
    21 }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.

v2: Use expedited rcu barriers to match our earlier timing
characteristics.
v3: Try to annotate locking contexts for sparse
v4: Reduce selftest lock duration to avoid a reset deadlock with fences
v5: s/srcu/reset_backoff_srcu/
v6: Remove more stale comments

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  12 +-
 drivers/gpu/drm/i915/i915_drv.h               |  18 +--
 drivers/gpu/drm/i915/i915_gem.c               |  56 +++------
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  36 ++----
 drivers/gpu/drm/i915/i915_gpu_error.h         |  39 ++-----
 drivers/gpu/drm/i915/i915_reset.c             | 110 +++++++++++-------
 drivers/gpu/drm/i915/i915_reset.h             |   4 +
 drivers/gpu/drm/i915/intel_drv.h              |   3 -
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   5 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   1 +
 10 files changed, 117 insertions(+), 167 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 0bd890c04fe4..a6fd157b1637 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
+	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Wedged\n");
+		seq_puts(m, "\tWedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_puts(m, "Waiter holding struct mutex\n");
-	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_puts(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "\tDevice (global) reset in progress\n");
 
 	if (!i915_modparams.enable_hangcheck) {
 		seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_backoff(&i915->gpu_error))
-		return -EAGAIN;
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a2293152cb6a..37230ae7fbe6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2989,7 +2989,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
 	i915_gem_object_unpin_pages(obj);
 }
 
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+	return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
 int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_device *dev,
 			 struct drm_mode_create_dumb *args);
@@ -3006,21 +3011,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
-	return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
 	return READ_ONCE(error->reset_count);
@@ -3093,7 +3088,6 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 05ce9176ac4e..1eb3a5f8654c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
-	int ret;
-
-	might_sleep();
-
-	/*
-	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
-	 * userspace. If it takes that long something really bad is going on and
-	 * we should simply try to bail out and fail as gracefully as possible.
-	 */
-	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_backoff(error),
-					       I915_RESET_TIMEOUT);
-	if (ret == 0) {
-		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
-		return -EIO;
-	} else if (ret < 0) {
-		return ret;
-	} else {
-		return 0;
-	}
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
-
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static u32 __i915_gem_park(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
@@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	intel_wakeref_t wakeref;
 	struct i915_vma *vma;
 	pgoff_t page_offset;
+	int srcu;
 	int ret;
 
 	/* Sanity check that we allow writing into this object */
@@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 		goto err_unlock;
 	}
 
-
 	/* Now pin it into the GTT as needed */
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 				       PIN_MAPPABLE |
@@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
+	srcu = i915_reset_trylock(dev_priv);
+	if (srcu < 0) {
+		ret = srcu;
+		goto err_unpin;
+	}
+
 	ret = i915_vma_pin_fence(vma);
 	if (ret)
-		goto err_unpin;
+		goto err_reset;
 
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
@@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 
 err_fence:
 	i915_vma_unpin_fence(vma);
+err_reset:
+	i915_reset_unlock(dev_priv, srcu);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
@@ -5326,6 +5293,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
+	init_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
@@ -5358,6 +5326,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
 
+	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
+
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index be89bd95ab7c..1ec1417cf8b4 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -270,6 +270,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		return 0;
 	}
 
+	ret = i915_reset_trylock(fence->i915);
+	if (ret < 0)
+		goto out_rpm;
+
 	fence_write(fence, vma);
 	fence->vma = vma;
 
@@ -278,8 +282,12 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
 	}
 
+	i915_reset_unlock(fence->i915, ret);
+	ret = 0;
+
+out_rpm:
 	intel_runtime_pm_put(fence->i915, wakeref);
-	return 0;
+	return ret;
 }
 
 /**
@@ -442,32 +450,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 	list_add(&fence->link, &fence->i915->mm.fence_list);
 }
 
-/**
- * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
- *
- * Removes all GTT mmappings via the fence registers. This forces any user
- * of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
- * revoke concurrent userspace access via GTT mmaps until the hardware has been
- * reset and the fence registers have been restored.
- */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
-{
-	int i;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
-
-		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
-
-		if (fence->vma)
-			i915_vma_revoke_mmap(fence->vma);
-	}
-}
-
 /**
  * i915_gem_restore_fences - restore fence state
  * @dev_priv: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..afa3adb28f02 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -204,39 +204,13 @@ struct i915_gpu_error {
 
 	atomic_t pending_fb_pin;
 
-	/**
-	 * State variable controlling the reset flow and count
-	 *
-	 * This is a counter which gets incremented when reset is triggered,
-	 *
-	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
-	 * meaning that any waiters holding onto the struct_mutex should
-	 * relinquish the lock immediately in order for the reset to start.
-	 *
-	 * If reset is not completed successfully, the I915_WEDGE bit is
-	 * set meaning that hardware is terminally sour and there is no
-	 * recovery. All waiters on the reset_queue will be woken when
-	 * that happens.
-	 *
-	 * This counter is used by the wait_seqno code to notice that reset
-	 * event happened and it needs to restart the entire ioctl (since most
-	 * likely the seqno it waited for won't ever signal anytime soon).
-	 *
-	 * This is important for lock-free wait paths, where no contended lock
-	 * naturally enforces the correct ordering between the bail-out of the
-	 * waiter and the gpu reset work code.
-	 */
-	unsigned long reset_count;
-
 	/**
 	 * flags: Control various stages of the GPU reset
 	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
+	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
+	 * serialise with any other users attempting to do the same, and
+	 * any global resources that may be clobbered by the reset (such as
+	 * FENCE registers).
 	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
@@ -255,6 +229,9 @@ struct i915_gpu_error {
 #define I915_RESET_ENGINE	2
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
+	/** Number of times the device has been reset (global) */
+	u32 reset_count;
+
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
 
@@ -272,6 +249,8 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
+	struct srcu_struct reset_backoff_srcu;
+
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 0e0ddf2e6815..c67d6c2a09a2 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -639,6 +639,32 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
 	engine->reset.prepare(engine);
 }
 
+static void revoke_mmaps(struct drm_i915_private *i915)
+{
+	int i;
+
+	for (i = 0; i < i915->num_fence_regs; i++) {
+		struct drm_vma_offset_node *node;
+		struct i915_vma *vma;
+		u64 vma_offset;
+
+		vma = READ_ONCE(i915->fence_regs[i].vma);
+		if (!vma)
+			continue;
+
+		if (!i915_vma_has_userfault(vma))
+			continue;
+
+		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
+		node = &vma->obj->base.vma_node;
+		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
+		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
+				    drm_vma_node_offset_addr(node) + vma_offset,
+				    vma->size,
+				    1);
+	}
+}
+
 static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -648,6 +674,7 @@ static void reset_prepare(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	intel_uc_sanitize(i915);
+	revoke_mmaps(i915);
 }
 
 static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +938,22 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
-struct __i915_reset {
-	struct drm_i915_private *i915;
-	unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
-	struct __i915_reset *arg = data;
-	int err;
-
-	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
-	if (err)
-		return err;
-
-	return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	struct __i915_reset arg = { i915, stalled_mask };
 	int err, i;
 
-	err = __do_reset(__i915_reset__BKL, &arg);
+	/* Flush everyone currently using a resource about to be clobbered */
+	synchronize_srcu(&i915->gpu_error.reset_backoff_srcu);
+
+	err = intel_gpu_reset(i915, ALL_ENGINES);
 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
-		msleep(100);
-		err = __do_reset(__i915_reset__BKL, &arg);
+		msleep(10 * (i + 1));
+		err = intel_gpu_reset(i915, ALL_ENGINES);
 	}
+	if (err)
+		return err;
 
-	return err;
+	return gt_reset(i915, stalled_mask);
 }
 
 /**
@@ -966,8 +965,6 @@ static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
  * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
  * on failure.
  *
- * Caller must hold the struct_mutex.
- *
  * Procedure is fairly simple:
  *   - reset the chip using the reset reg
  *   - re-init context state
@@ -1274,9 +1271,12 @@ void i915_handle_error(struct drm_i915_private *i915,
 		wait_event(i915->gpu_error.reset_queue,
 			   !test_bit(I915_RESET_BACKOFF,
 				     &i915->gpu_error.flags));
-		goto out;
+		goto out; /* piggy-back on the other reset */
 	}
 
+	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
+	synchronize_rcu_expedited();
+
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1300,36 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+int i915_reset_trylock(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	int srcu;
+
+	rcu_read_lock();
+	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {
+		rcu_read_unlock();
+
+		if (wait_event_interruptible(error->reset_queue,
+					     !test_bit(I915_RESET_BACKOFF,
+						       &error->flags)))
+			return -EINTR;
+
+		rcu_read_lock();
+	}
+	srcu = srcu_read_lock(&error->reset_backoff_srcu);
+	rcu_read_unlock();
+
+	return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+__releases(&i915->gpu_error.reset_backoff_srcu)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	srcu_read_unlock(&error->reset_backoff_srcu, tag);
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..893c5d1c2eb8 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/srcu.h>
 
 struct drm_i915_private;
 struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
 void i915_reset_request(struct i915_request *rq, bool guilty);
 bool i915_reset_flush(struct drm_i915_private *i915);
 
+int __must_check i915_reset_trylock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 9ec3d00fbd19..51a144534519 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -989,9 +989,6 @@ struct intel_crtc {
 
 	struct intel_crtc_state *config;
 
-	/* global reset count when the last flip was submitted */
-	unsigned int reset_count;
-
 	/* Access to these should be protected by dev_priv->irq_lock. */
 	bool cpu_fifo_underrun_disabled;
 	bool pch_fifo_underrun_disabled;
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 7b6f3bea9ef8..4886fac12628 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1039,8 +1039,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 
 	/* Check that we can recover an unbind stuck on a hanging request */
 
-	igt_global_reset_lock(i915);
-
 	mutex_lock(&i915->drm.struct_mutex);
 	err = hang_init(&h, i915);
 	if (err)
@@ -1138,7 +1136,9 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
+	igt_global_reset_lock(i915);
 	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
+	igt_global_reset_unlock(i915);
 
 	if (tsk) {
 		struct igt_wedge_me w;
@@ -1159,7 +1159,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	hang_fini(&h);
 unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
-	igt_global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
 		return -EIO;
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..fc516a2970f4 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	init_srcu_struct(&i915->gpu_error.reset_backoff_srcu);
 	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 4/8] drm/i915: Force the GPU reset upon wedging
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
  2019-02-07  7:18 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
  2019-02-07  7:18 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-08  9:31   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
                   ` (7 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

When declaring the GPU wedged, we do need to hit the GPU with the reset
hammer so that its state matches our presumed state during cleanup. If
the reset fails, it fails, and we may be unhappy but wedged. However, if
we are testing our wedge/unwedged handling, the desync carries over into
the next test and promptly explodes.

References: https://bugs.freedesktop.org/show_bug.cgi?id=106702
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_reset.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c67d6c2a09a2..64f26e17243a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -532,9 +532,6 @@ typedef int (*reset_func)(struct drm_i915_private *,
 
 static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
 {
-	if (!i915_modparams.reset)
-		return NULL;
-
 	if (INTEL_GEN(i915) >= 8)
 		return gen8_reset_engines;
 	else if (INTEL_GEN(i915) >= 6)
@@ -599,6 +596,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *i915)
 	if (USES_GUC(i915))
 		return false;
 
+	if (!i915_modparams.reset)
+		return NULL;
+
 	return intel_get_gpu_reset(i915);
 }
 
@@ -824,7 +824,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	/* Even if the GPU reset fails, it should still stop the engines */
-	if (INTEL_GEN(i915) >= 5)
+	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
 		intel_gpu_reset(i915, ALL_ENGINES);
 
 	for_each_engine(engine, i915, id) {
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (2 preceding siblings ...)
  2019-02-07  7:18 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-08  9:46   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
                   ` (6 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

On wedging, we mark all executing requests as complete and all pending
requests as completed as soon as they are ready. Before unwedging though
we wish to flush those pending requests prior to restoring default
execution, and so we must wait. Do so uninterruptibly as we do not
provide the EINTR gracefully back to userspace in this case but persist
in the permanently wedged state without restarting the syscall.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 28 ++++++++--------------------
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 64f26e17243a..c4fcb450bd80 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -862,7 +862,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
-	bool ret = false;
 
 	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
@@ -887,30 +886,20 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	mutex_lock(&i915->gt.timelines.mutex);
 	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
 		struct i915_request *rq;
-		long timeout;
 
 		rq = i915_active_request_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
 		/*
-		 * We can't use our normal waiter as we want to
-		 * avoid recursively trying to handle the current
-		 * reset. The basic dma_fence_default_wait() installs
-		 * a callback for dma_fence_signal(), which is
-		 * triggered by our nop handler (indirectly, the
-		 * callback enables the signaler thread which is
-		 * woken by the nop_submit_request() advancing the seqno
-		 * and when the seqno passes the fence, the signaler
-		 * then signals the fence waking us up).
+		 * All internal dependencies (i915_requests) will have
+		 * been flushed by the set-wedge, but we may be stuck waiting
+		 * for external fences. These should all be capped to 10s
+		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
+		 * in the worst case.
 		 */
-		timeout = dma_fence_default_wait(&rq->fence, true,
-						 MAX_SCHEDULE_TIMEOUT);
+		dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
 		i915_request_put(rq);
-		if (timeout < 0) {
-			mutex_unlock(&i915->gt.timelines.mutex);
-			goto unlock;
-		}
 	}
 	mutex_unlock(&i915->gt.timelines.mutex);
 
@@ -931,11 +920,10 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
-	ret = true;
-unlock:
+
 	mutex_unlock(&i915->gpu_error.wedge_mutex);
 
-	return ret;
+	return true;
 }
 
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (3 preceding siblings ...)
  2019-02-07  7:18 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-08  9:56   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

Since we use the debugfs to recover the device after modifying the
i915.reset parameter, we need to be sure that we apply the reset and not
piggy-back onto a concurrent one in order for the parameter to take
effect.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index a6fd157b1637..8a488ffc8b7d 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
 
-	/*
-	 * There is no safeguard against this debugfs entry colliding
-	 * with the hangcheck calling same i915_handle_error() in
-	 * parallel, causing an explosion. For now we assume that the
-	 * test harness is responsible enough not to inject gpu hangs
-	 * while it is writing to 'i915_wedged'
-	 */
+	/* Flush any previous reset before applying for a new one */
+	wait_event(i915->gpu_error.reset_queue,
+		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
 
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 7/8] drm/i915: Serialise resets with wedging
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (4 preceding siblings ...)
  2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-08 14:30   ` Mika Kuoppala
  2019-02-07  7:18 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
                   ` (4 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

Prevent concurrent set-wedge with ongoing resets (and vice versa) by
taking the same wedge_mutex around both operations.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 68 ++++++++++++++++++-------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index c4fcb450bd80..9494b015185a 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -794,17 +794,14 @@ static void nop_submit_request(struct i915_request *request)
 	intel_engine_queue_breadcrumbs(engine);
 }
 
-void i915_gem_set_wedged(struct drm_i915_private *i915)
+static void __i915_gem_set_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	mutex_lock(&error->wedge_mutex);
-	if (test_bit(I915_WEDGED, &error->flags)) {
-		mutex_unlock(&error->wedge_mutex);
+	if (test_bit(I915_WEDGED, &error->flags))
 		return;
-	}
 
 	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
 		struct drm_printer p = drm_debug_printer(__func__);
@@ -853,12 +850,18 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 	set_bit(I915_WEDGED, &error->flags);
 
 	GEM_TRACE("end\n");
-	mutex_unlock(&error->wedge_mutex);
+}
 
-	wake_up_all(&error->reset_queue);
+void i915_gem_set_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+
+	mutex_lock(&error->wedge_mutex);
+	__i915_gem_set_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
 }
 
-bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	struct i915_timeline *tl;
@@ -869,8 +872,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
 		return false;
 
-	mutex_lock(&error->wedge_mutex);
-
 	GEM_TRACE("start\n");
 
 	/*
@@ -921,11 +922,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
 	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
 
-	mutex_unlock(&i915->gpu_error.wedge_mutex);
-
 	return true;
 }
 
+bool i915_gem_unset_wedged(struct drm_i915_private *i915)
+{
+	struct i915_gpu_error *error = &i915->gpu_error;
+	bool result;
+
+	mutex_lock(&error->wedge_mutex);
+	result = __i915_gem_unset_wedged(i915);
+	mutex_unlock(&error->wedge_mutex);
+
+	return result;
+}
+
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
 	int err, i;
@@ -975,7 +986,7 @@ void i915_reset(struct drm_i915_private *i915,
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
 	/* Clear any previous failed attempts at recovery. Time to try again. */
-	if (!i915_gem_unset_wedged(i915))
+	if (!__i915_gem_unset_wedged(i915))
 		return;
 
 	if (reason)
@@ -1037,7 +1048,7 @@ void i915_reset(struct drm_i915_private *i915,
 	 */
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
-	i915_gem_set_wedged(i915);
+	__i915_gem_set_wedged(i915);
 	goto finish;
 }
 
@@ -1129,7 +1140,9 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
+		mutex_lock(&error->wedge_mutex);
 		i915_reset(i915, engine_mask, reason);
+		mutex_unlock(&error->wedge_mutex);
 
 		intel_finish_reset(i915);
 	}
@@ -1197,6 +1210,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 		       unsigned long flags,
 		       const char *fmt, ...)
 {
+	struct i915_gpu_error *error = &i915->gpu_error;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	unsigned int tmp;
@@ -1233,20 +1247,19 @@ void i915_handle_error(struct drm_i915_private *i915,
 	 * Try engine reset when available. We fall back to full reset if
 	 * single reset fails.
 	 */
-	if (intel_has_reset_engine(i915) &&
-	    !i915_terminally_wedged(&i915->gpu_error)) {
+	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
 		for_each_engine_masked(engine, i915, engine_mask, tmp) {
 			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
 			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					     &i915->gpu_error.flags))
+					     &error->flags))
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
 				engine_mask &= ~intel_engine_flag(engine);
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
-				  &i915->gpu_error.flags);
-			wake_up_bit(&i915->gpu_error.flags,
+				  &error->flags);
+			wake_up_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id);
 		}
 	}
@@ -1255,10 +1268,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 
 	/* Full reset needs the mutex, stop any other user trying to do so. */
-	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
-		wait_event(i915->gpu_error.reset_queue,
-			   !test_bit(I915_RESET_BACKOFF,
-				     &i915->gpu_error.flags));
+	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
+		wait_event(error->reset_queue,
+			   !test_bit(I915_RESET_BACKOFF, &error->flags));
 		goto out; /* piggy-back on the other reset */
 	}
 
@@ -1268,8 +1280,8 @@ void i915_handle_error(struct drm_i915_private *i915,
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
-					&i915->gpu_error.flags))
-			wait_on_bit(&i915->gpu_error.flags,
+					&error->flags))
+			wait_on_bit(&error->flags,
 				    I915_RESET_ENGINE + engine->id,
 				    TASK_UNINTERRUPTIBLE);
 	}
@@ -1278,11 +1290,11 @@ void i915_handle_error(struct drm_i915_private *i915,
 
 	for_each_engine(engine, i915, tmp) {
 		clear_bit(I915_RESET_ENGINE + engine->id,
-			  &i915->gpu_error.flags);
+			  &error->flags);
 	}
 
-	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
-	wake_up_all(&i915->gpu_error.reset_queue);
+	clear_bit(I915_RESET_BACKOFF, &error->flags);
+	wake_up_all(&error->reset_queue);
 
 out:
 	intel_runtime_pm_put(i915, wakeref);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (5 preceding siblings ...)
  2019-02-07  7:18 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
@ 2019-02-07  7:18 ` Chris Wilson
  2019-02-07  7:41   ` [PATCH] " Chris Wilson
  2019-02-07  8:08 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2) Patchwork
                   ` (3 subsequent siblings)
  10 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:18 UTC (permalink / raw)
  To: intel-gfx

If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c              | 34 ++++++++++++++++++-
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 ++++-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 ++++
 3 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..5d5ce91a5dfa 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1878,6 +1878,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+	const struct intel_ring *ring = rq->ring;
+	const u32 *regs = rq->hw_context->lrc_reg_state;
+
+	/* Quick spot check for the common signs of context corruption */
+
+	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+	    (RING_CTL_SIZE(ring->size) | RING_VALID))
+		return false;
+
+	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+		return false;
+
+	return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1929,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	if (!rq)
 		goto out_unlock;
 
+	/*
+	 * If this request hasn't started yet, e.g. it is waiting on a
+	 * semaphore, we need to avoid skipping the request or else we
+	 * break the signaling chain. However, if the context is corrupt
+	 * the request will not restart and we will be stuck with a wedged
+	 * device. It is quite often the case that if we issue a reset
+	 * while the GPU is loading the context image, that context image
+	 * becomes corrupt.
+	 *
+	 * Otherwise, if we have not started yet, the request should replay
+	 * perfectly and we do not need to flag the result as being erroneous.
+	 */
+	if (!i915_request_started(rq) && lrc_regs_ok(rq))
+		goto out_unlock;
+
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1924,7 +1956,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	 * image back to the expected values to skip over the guilty request.
 	 */
 	i915_reset_request(rq, stalled);
-	if (!stalled)
+	if (!stalled && lrc_regs_ok(rq))
 		goto out_unlock;
 
 	/*
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
 	*batch++ = upper_32_bits(vma->node.start);
 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
 
-	i915_gem_chipset_flush(spin->i915);
+	if (engine->emit_init_breadcrumb &&
+	    rq->timeline->has_initial_breadcrumb) {
+		err = engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
 
 	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
+	i915_gem_chipset_flush(spin->i915);
+
 cancel_rq:
 	if (err) {
 		i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..36c17bfe05a7 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 	if (INTEL_GEN(vm->i915) <= 5)
 		flags |= I915_DISPATCH_SECURE;
 
+	if (rq->engine->emit_init_breadcrumb) {
+		err = rq->engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
+
 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 
 cancel_rq:
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* [PATCH] drm/i915: Don't claim an unstarted request was guilty
  2019-02-07  7:18 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
@ 2019-02-07  7:41   ` Chris Wilson
  2019-02-08 14:47     ` Mika Kuoppala
  0 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07  7:41 UTC (permalink / raw)
  To: intel-gfx

If we haven't even begun executing the payload of the stalled request,
then we should not claim that its userspace context was guilty of
submitting a hanging batch.

v2: Check for context corruption before trying to restart.
v3: Preserve semaphores on skipping requests (need to keep the timelines
intact).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 +++-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
 3 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5e98fd79bd9d..e3134a635926 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
 	*cs++ = rq->fence.seqno - 1;
 
 	intel_ring_advance(rq, cs);
+
+	/* Record the updated position of the request's payload */
+	rq->infix = intel_ring_offset(rq, cs);
+
 	return 0;
 }
 
@@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
+static bool lrc_regs_ok(const struct i915_request *rq)
+{
+	const struct intel_ring *ring = rq->ring;
+	const u32 *regs = rq->hw_context->lrc_reg_state;
+
+	/* Quick spot check for the common signs of context corruption */
+
+	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
+	    (RING_CTL_SIZE(ring->size) | RING_VALID))
+		return false;
+
+	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
+		return false;
+
+	return true;
+}
+
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	if (!rq)
 		goto out_unlock;
 
+	/*
+	 * If this request hasn't started yet, e.g. it is waiting on a
+	 * semaphore, we need to avoid skipping the request or else we
+	 * break the signaling chain. However, if the context is corrupt
+	 * the request will not restart and we will be stuck with a wedged
+	 * device. It is quite often the case that if we issue a reset
+	 * while the GPU is loading the context image, that context image
+	 * becomes corrupt.
+	 *
+	 * Otherwise, if we have not started yet, the request should replay
+	 * perfectly and we do not need to flag the result as being erroneous.
+	 */
+	if (!i915_request_started(rq) && lrc_regs_ok(rq))
+		goto out_unlock;
+
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
 	 * and will try to replay it on restarting. The context image may
@@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	 * image back to the expected values to skip over the guilty request.
 	 */
 	i915_reset_request(rq, stalled);
-	if (!stalled)
+	if (!stalled && lrc_regs_ok(rq))
 		goto out_unlock;
 
 	/*
@@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 		       engine->context_size - PAGE_SIZE);
 	}
 
-	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+	/* Rerun the request; its payload has been neutered (if guilty). */
+	rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
 	intel_ring_update_space(rq->ring);
 
 	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
 	*batch++ = upper_32_bits(vma->node.start);
 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
 
-	i915_gem_chipset_flush(spin->i915);
+	if (engine->emit_init_breadcrumb &&
+	    rq->timeline->has_initial_breadcrumb) {
+		err = engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
 
 	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
+	i915_gem_chipset_flush(spin->i915);
+
 cancel_rq:
 	if (err) {
 		i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 4886fac12628..36c17bfe05a7 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
 	if (INTEL_GEN(vm->i915) <= 5)
 		flags |= I915_DISPATCH_SECURE;
 
+	if (rq->engine->emit_init_breadcrumb) {
+		err = rq->engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
+
 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
 
 cancel_rq:
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

* ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (6 preceding siblings ...)
  2019-02-07  7:18 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
@ 2019-02-07  8:08 ` Patchwork
  2019-02-07  8:25 ` ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 33+ messages in thread
From: Patchwork @ 2019-02-07  8:08 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
URL   : https://patchwork.freedesktop.org/series/56328/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Hack and slash, throttle execbuffer hogs
Okay!

Commit: drm/i915: Defer removing fence register tracking to rpm wakeup
Okay!

Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1303:5: warning: context imbalance in 'i915_reset_trylock' - different lock contexts for basic block
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3565:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3559:16: warning: expression using sizeof(void)

Commit: drm/i915: Force the GPU reset upon wedging
Okay!

Commit: drm/i915: Uninterruptibly drain the timelines on unwedging
Okay!

Commit: drm/i915: Wait for old resets before applying debugfs/i915_wedged
Okay!

Commit: drm/i915: Serialise resets with wedging
Okay!

Commit: drm/i915: Don't claim an unstarted request was guilty
Okay!

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (7 preceding siblings ...)
  2019-02-07  8:08 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2) Patchwork
@ 2019-02-07  8:25 ` Patchwork
  2019-02-07  9:53 ` ✓ Fi.CI.IGT: " Patchwork
  2019-02-07 16:01 ` [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Joonas Lahtinen
  10 siblings, 0 replies; 33+ messages in thread
From: Patchwork @ 2019-02-07  8:25 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
URL   : https://patchwork.freedesktop.org/series/56328/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_5557 -> Patchwork_12165
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/56328/revisions/2/mbox/

Known issues
------------

  Here are the changes found in Patchwork_12165 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-kbl-7500u:       PASS -> FAIL [fdo#109485]

  * igt@kms_frontbuffer_tracking@basic:
    - fi-hsw-peppy:       NOTRUN -> DMESG-FAIL [fdo#102614]
    - fi-byt-clapper:     PASS -> FAIL [fdo#103167]

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-a:
    - fi-blb-e6850:       PASS -> INCOMPLETE [fdo#107718]

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-b:
    - fi-byt-clapper:     PASS -> FAIL [fdo#103191] / [fdo#107362] +1

  * igt@prime_vgem@basic-fence-flip:
    - fi-gdg-551:         PASS -> FAIL [fdo#103182]

  
#### Possible fixes ####

  * igt@kms_busy@basic-flip-b:
    - fi-gdg-551:         FAIL [fdo#103182] -> PASS

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#102614]: https://bugs.freedesktop.org/show_bug.cgi?id=102614
  [fdo#103167]: https://bugs.freedesktop.org/show_bug.cgi?id=103167
  [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#107362]: https://bugs.freedesktop.org/show_bug.cgi?id=107362
  [fdo#107718]: https://bugs.freedesktop.org/show_bug.cgi?id=107718
  [fdo#108569]: https://bugs.freedesktop.org/show_bug.cgi?id=108569
  [fdo#109226]: https://bugs.freedesktop.org/show_bug.cgi?id=109226
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109485]: https://bugs.freedesktop.org/show_bug.cgi?id=109485
  [fdo#109527]: https://bugs.freedesktop.org/show_bug.cgi?id=109527


Participating hosts (47 -> 43)
------------------------------

  Additional (3): fi-skl-guc fi-skl-6770hq fi-hsw-peppy 
  Missing    (7): fi-kbl-soraka fi-ilk-m540 fi-byt-squawks fi-icl-u2 fi-bwr-2160 fi-bsw-cyan fi-bdw-samus 


Build changes
-------------

    * Linux: CI_DRM_5557 -> Patchwork_12165

  CI_DRM_5557: 72302b1c5245655423f75a857aec82f037991b6f @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4812: 592b854fead32c2b0dac7198edfb9a6bffd66932 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12165: 71cf64add1a4a4161dafd6dc981760bd15dc0aee @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

71cf64add1a4 drm/i915: Don't claim an unstarted request was guilty
ce1f174a350f drm/i915: Serialise resets with wedging
d97c702e76f1 drm/i915: Wait for old resets before applying debugfs/i915_wedged
170a5922ce52 drm/i915: Uninterruptibly drain the timelines on unwedging
77266f8b305c drm/i915: Force the GPU reset upon wedging
d09a96c81249 drm/i915: Revoke mmaps and prevent access to fence registers across reset
ee217ea14032 drm/i915: Defer removing fence register tracking to rpm wakeup
cb0ee23a6277 drm/i915: Hack and slash, throttle execbuffer hogs

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12165/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* ✓ Fi.CI.IGT: success for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (8 preceding siblings ...)
  2019-02-07  8:25 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2019-02-07  9:53 ` Patchwork
  2019-02-07 16:01 ` [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Joonas Lahtinen
  10 siblings, 0 replies; 33+ messages in thread
From: Patchwork @ 2019-02-07  9:53 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2)
URL   : https://patchwork.freedesktop.org/series/56328/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_5557_full -> Patchwork_12165_full
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  

Known issues
------------

  Here are the changes found in Patchwork_12165_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_suspend@basic-s3:
    - shard-snb:          PASS -> DMESG-WARN [fdo#102365]

  * igt@kms_available_modes_crc@available_mode_test_crc:
    - shard-glk:          PASS -> FAIL [fdo#106641]

  * igt@kms_busy@extended-modeset-hang-newfb-render-b:
    - shard-kbl:          NOTRUN -> DMESG-WARN [fdo#107956] +4

  * igt@kms_busy@extended-modeset-hang-newfb-with-reset-render-b:
    - shard-snb:          NOTRUN -> DMESG-WARN [fdo#107956]

  * igt@kms_busy@extended-pageflip-hang-newfb-render-a:
    - shard-apl:          NOTRUN -> DMESG-WARN [fdo#107956]

  * igt@kms_color@pipe-a-degamma:
    - shard-kbl:          NOTRUN -> FAIL [fdo#104782] / [fdo#108145]

  * igt@kms_content_protection@atomic:
    - shard-kbl:          NOTRUN -> FAIL [fdo#108597] +1

  * igt@kms_cursor_crc@cursor-128x128-dpms:
    - shard-kbl:          NOTRUN -> FAIL [fdo#103232]

  * igt@kms_cursor_crc@cursor-64x21-random:
    - shard-apl:          PASS -> FAIL [fdo#103232] +2

  * igt@kms_cursor_crc@cursor-64x64-dpms:
    - shard-glk:          PASS -> FAIL [fdo#103232] +1

  * igt@kms_cursor_crc@cursor-64x64-suspend:
    - shard-kbl:          NOTRUN -> FAIL [fdo#103191] / [fdo#103232]

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-cur-indfb-draw-pwrite:
    - shard-apl:          PASS -> FAIL [fdo#103167] +1

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-fullscreen:
    - shard-kbl:          NOTRUN -> FAIL [fdo#103167]

  * igt@kms_frontbuffer_tracking@fbc-2p-primscrn-spr-indfb-draw-pwrite:
    - shard-glk:          PASS -> FAIL [fdo#103167] +2

  * igt@kms_plane_alpha_blend@pipe-a-constant-alpha-max:
    - shard-apl:          NOTRUN -> FAIL [fdo#108145]

  * igt@kms_plane_alpha_blend@pipe-c-alpha-7efc:
    - shard-kbl:          NOTRUN -> FAIL [fdo#108145] / [fdo#108590] +3

  * igt@kms_plane_alpha_blend@pipe-c-alpha-opaque-fb:
    - shard-kbl:          NOTRUN -> FAIL [fdo#108145] +4

  * igt@kms_universal_plane@universal-plane-pipe-a-functional:
    - shard-glk:          PASS -> FAIL [fdo#103166]
    - shard-apl:          PASS -> FAIL [fdo#103166]

  
#### Possible fixes ####

  * igt@gem_exec_blt@cold-min:
    - shard-glk:          DMESG-WARN [fdo#105763] / [fdo#106538] -> PASS

  * igt@gem_exec_schedule@pi-ringfull-blt:
    - shard-apl:          FAIL [fdo#103158] -> PASS +3

  * igt@gem_exec_schedule@pi-ringfull-bsd:
    - shard-glk:          FAIL [fdo#103158] -> PASS +3

  * igt@gem_exec_schedule@pi-ringfull-bsd1:
    - shard-kbl:          FAIL [fdo#103158] -> PASS +1

  * igt@gem_mmap_gtt@hang:
    - shard-hsw:          FAIL [fdo#109469] -> PASS
    - shard-glk:          FAIL [fdo#109469] -> PASS
    - shard-apl:          FAIL [fdo#109469] -> PASS

  * igt@i915_selftest@live_hangcheck:
    - shard-snb:          INCOMPLETE [fdo#105411] -> PASS

  * igt@kms_cursor_crc@cursor-256x85-random:
    - shard-apl:          FAIL [fdo#103232] -> PASS +1

  * igt@kms_cursor_crc@cursor-size-change:
    - shard-glk:          FAIL [fdo#103232] -> PASS +3

  * igt@kms_cursor_legacy@2x-long-nonblocking-modeset-vs-cursor-atomic:
    - shard-glk:          FAIL [fdo#105454] / [fdo#106509] -> PASS

  * igt@kms_flip@basic-plain-flip:
    - shard-hsw:          DMESG-WARN [fdo#102614] -> PASS

  * igt@kms_frontbuffer_tracking@fbc-1p-offscren-pri-indfb-draw-blt:
    - shard-snb:          {SKIP} [fdo#109271] -> PASS +5

  * igt@kms_frontbuffer_tracking@fbc-1p-primscrn-spr-indfb-draw-mmap-gtt:
    - shard-glk:          FAIL [fdo#103167] -> PASS +1

  * igt@kms_plane@pixel-format-pipe-b-planes-source-clamping:
    - shard-apl:          FAIL [fdo#108948] -> PASS

  * igt@kms_plane_alpha_blend@pipe-a-alpha-opaque-fb:
    - shard-apl:          FAIL [fdo#108145] -> PASS
    - shard-glk:          FAIL [fdo#108145] -> PASS +1

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-yf:
    - shard-apl:          FAIL [fdo#103166] -> PASS +1

  * igt@kms_setmode@basic:
    - shard-apl:          FAIL [fdo#99912] -> PASS
    - shard-hsw:          FAIL [fdo#99912] -> PASS

  
#### Warnings ####

  * igt@gem_mmap_gtt@hang:
    - shard-snb:          FAIL [fdo#109469] -> INCOMPLETE [fdo#105411]

  * igt@kms_ccs@pipe-c-bad-rotation-90:
    - shard-apl:          {SKIP} [fdo#109271] -> INCOMPLETE [fdo#103927]

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#102365]: https://bugs.freedesktop.org/show_bug.cgi?id=102365
  [fdo#102614]: https://bugs.freedesktop.org/show_bug.cgi?id=102614
  [fdo#103158]: https://bugs.freedesktop.org/show_bug.cgi?id=103158
  [fdo#103166]: https://bugs.freedesktop.org/show_bug.cgi?id=103166
  [fdo#103167]: https://bugs.freedesktop.org/show_bug.cgi?id=103167
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#103232]: https://bugs.freedesktop.org/show_bug.cgi?id=103232
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#104782]: https://bugs.freedesktop.org/show_bug.cgi?id=104782
  [fdo#105411]: https://bugs.freedesktop.org/show_bug.cgi?id=105411
  [fdo#105454]: https://bugs.freedesktop.org/show_bug.cgi?id=105454
  [fdo#105763]: https://bugs.freedesktop.org/show_bug.cgi?id=105763
  [fdo#106509]: https://bugs.freedesktop.org/show_bug.cgi?id=106509
  [fdo#106538]: https://bugs.freedesktop.org/show_bug.cgi?id=106538
  [fdo#106641]: https://bugs.freedesktop.org/show_bug.cgi?id=106641
  [fdo#107956]: https://bugs.freedesktop.org/show_bug.cgi?id=107956
  [fdo#108145]: https://bugs.freedesktop.org/show_bug.cgi?id=108145
  [fdo#108590]: https://bugs.freedesktop.org/show_bug.cgi?id=108590
  [fdo#108597]: https://bugs.freedesktop.org/show_bug.cgi?id=108597
  [fdo#108948]: https://bugs.freedesktop.org/show_bug.cgi?id=108948
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109469]: https://bugs.freedesktop.org/show_bug.cgi?id=109469
  [fdo#99912]: https://bugs.freedesktop.org/show_bug.cgi?id=99912


Participating hosts (7 -> 5)
------------------------------

  Missing    (2): shard-skl shard-iclb 


Build changes
-------------

    * Linux: CI_DRM_5557 -> Patchwork_12165

  CI_DRM_5557: 72302b1c5245655423f75a857aec82f037991b6f @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4812: 592b854fead32c2b0dac7198edfb9a6bffd66932 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12165: 71cf64add1a4a4161dafd6dc981760bd15dc0aee @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12165/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-07  7:18 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
@ 2019-02-07 13:22   ` Mika Kuoppala
  2019-02-07 13:38     ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-07 13:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Currently, we may simultaneously release the fence register from both
> fence_update() and i915_gem_restore_fences(). This is dangerous, so
> defer the bookkeeping entirely to i915_gem_restore_fences() when the
> device is asleep.
>
> Reported-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_fence_reg.c | 62 ++++++++++++-----------
>  1 file changed, 32 insertions(+), 30 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index e037e94792f3..be89bd95ab7c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -210,6 +210,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  			struct i915_vma *vma)
>  {
>  	intel_wakeref_t wakeref;
> +	struct i915_vma *old;
>  	int ret;
>  
>  	if (vma) {
> @@ -229,49 +230,55 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  			return ret;
>  	}
>  
> -	if (fence->vma) {
> -		struct i915_vma *old = fence->vma;
> -
> +	old = xchg(&fence->vma, NULL);

So this is for restore seeing fence consistently.

> +	if (old) {
>  		ret = i915_active_request_retire(&old->last_fence,
>  					     &old->obj->base.dev->struct_mutex);
> -		if (ret)
> +		if (ret) {
> +			fence->vma = old;

And this then won't matter as the restore fences had zeroed
the fence reg before error. We get back to exact state
later but when?

>  			return ret;
> +		}
>  
>  		i915_vma_flush_writes(old);
> -	}
>  
> -	if (fence->vma && fence->vma != vma) {
> -		/* Ensure that all userspace CPU access is completed before
> +		/*
> +		 * Ensure that all userspace CPU access is completed before
>  		 * stealing the fence.
>  		 */
> -		GEM_BUG_ON(fence->vma->fence != fence);
> -		i915_vma_revoke_mmap(fence->vma);
> -
> -		fence->vma->fence = NULL;
> -		fence->vma = NULL;
> +		if (old != vma) {
> +			GEM_BUG_ON(old->fence != fence);
> +			i915_vma_revoke_mmap(old);
> +			old->fence = NULL;
> +		}
>  
>  		list_move(&fence->link, &fence->i915->mm.fence_list);
>  	}
>  
> -	/* We only need to update the register itself if the device is awake.
> +	/*
> +	 * We only need to update the register itself if the device is awake.
>  	 * If the device is currently powered down, we will defer the write
>  	 * to the runtime resume, see i915_gem_restore_fences().
> +	 *
> +	 * This only works for removing the fence register, on acquisition
> +	 * the caller must hold the rpm wakeref. The fence register must
> +	 * be cleared before we can use any other fences to ensure that
> +	 * the new fences do not overlap the elided clears, confusing HW.
>  	 */
>  	wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
> -	if (wakeref) {
> -		fence_write(fence, vma);
> -		intel_runtime_pm_put(fence->i915, wakeref);
> +	if (!wakeref) {
> +		GEM_BUG_ON(vma);
> +		return 0;
>  	}
>  
> -	if (vma) {
> -		if (fence->vma != vma) {
> -			vma->fence = fence;
> -			fence->vma = vma;
> -		}
> +	fence_write(fence, vma);
> +	fence->vma = vma;
>  
> +	if (vma) {
> +		vma->fence = fence;
>  		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
>  	}
>  
> +	intel_runtime_pm_put(fence->i915, wakeref);
>  	return 0;
>  }
>  
> @@ -473,9 +480,10 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>  {
>  	int i;
>  
> +	rcu_read_lock(); /* keep obj alive as we dereference */
>  	for (i = 0; i < dev_priv->num_fence_regs; i++) {
>  		struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];

I have spent some amount of time trying to figure out if
there is a reasoning of sometimes calling the fence reg as 'fence'
and in other cases 'reg'.

If there is a reason, help me out. If there is not, I
politely ask to follow the same naming as in revoke_fences.

Or that we go for 'fence_reg' always when talking about
preallocated reg slots.

> -		struct i915_vma *vma = reg->vma;
> +		struct i915_vma *vma = READ_ONCE(reg->vma);
>  
>  		GEM_BUG_ON(vma && vma->fence != reg);
>  
> @@ -483,18 +491,12 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>  		 * Commit delayed tiling changes if we have an object still
>  		 * attached to the fence, otherwise just clear the fence.
>  		 */
> -		if (vma && !i915_gem_object_is_tiled(vma->obj)) {
> -			GEM_BUG_ON(!reg->dirty);

You omit the dirty check here. If the reasoning is
> that we can't sample due to inconsistency wrt rest of fence reg,
does it then lead to need to make a __fence_write()
that does not write the dirty flag.

For making sure that for next pin won't drop the write?

> -			GEM_BUG_ON(i915_vma_has_userfault(vma));
> -
> -			list_move(&reg->link, &dev_priv->mm.fence_list);

This makes life easier.

> -			vma->fence = NULL;
> +		if (vma && !i915_gem_object_is_tiled(vma->obj))
>  			vma = NULL;
> -		}
>  
>  		fence_write(reg, vma);
> -		reg->vma = vma;
>  	}
> +	rcu_read_unlock();
>  }
>  
>  /**
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-07 13:22   ` Mika Kuoppala
@ 2019-02-07 13:38     ` Chris Wilson
  2019-02-07 14:09       ` Mika Kuoppala
  0 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-07 13:38 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-07 13:22:45)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Currently, we may simultaneously release the fence register from both
> > fence_update() and i915_gem_restore_fences(). This is dangerous, so
> > defer the bookkeeping entirely to i915_gem_restore_fences() when the
> > device is asleep.
> >
> > Reported-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_gem_fence_reg.c | 62 ++++++++++++-----------
> >  1 file changed, 32 insertions(+), 30 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > index e037e94792f3..be89bd95ab7c 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> > @@ -210,6 +210,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
> >                       struct i915_vma *vma)
> >  {
> >       intel_wakeref_t wakeref;
> > +     struct i915_vma *old;
> >       int ret;
> >  
> >       if (vma) {
> > @@ -229,49 +230,55 @@ static int fence_update(struct drm_i915_fence_reg *fence,
> >                       return ret;
> >       }
> >  
> > -     if (fence->vma) {
> > -             struct i915_vma *old = fence->vma;
> > -
> > +     old = xchg(&fence->vma, NULL);
> 
> So this is for restore seeing fence consistently.
> 
> > +     if (old) {
> >               ret = i915_active_request_retire(&old->last_fence,
> >                                            &old->obj->base.dev->struct_mutex);
> > -             if (ret)
> > +             if (ret) {
> > +                     fence->vma = old;
> 
> And this then won't matter as the restore fences had zeroed
> the fence reg before error. We get back to exact state
> later but when?

This operation is under the mutex guarding the fences, and the previous
fence was unpinned so not in use. Prior to being used, all is
consistent.

> >                       return ret;
> > +             }
> >  
> >               i915_vma_flush_writes(old);
> > -     }
> >  
> > -     if (fence->vma && fence->vma != vma) {
> > -             /* Ensure that all userspace CPU access is completed before
> > +             /*
> > +              * Ensure that all userspace CPU access is completed before
> >                * stealing the fence.
> >                */
> > -             GEM_BUG_ON(fence->vma->fence != fence);
> > -             i915_vma_revoke_mmap(fence->vma);
> > -
> > -             fence->vma->fence = NULL;
> > -             fence->vma = NULL;
> > +             if (old != vma) {
> > +                     GEM_BUG_ON(old->fence != fence);
> > +                     i915_vma_revoke_mmap(old);
> > +                     old->fence = NULL;
> > +             }
> >  
> >               list_move(&fence->link, &fence->i915->mm.fence_list);
> >       }
> >  
> > -     /* We only need to update the register itself if the device is awake.
> > +     /*
> > +      * We only need to update the register itself if the device is awake.
> >        * If the device is currently powered down, we will defer the write
> >        * to the runtime resume, see i915_gem_restore_fences().
> > +      *
> > +      * This only works for removing the fence register, on acquisition
> > +      * the caller must hold the rpm wakeref. The fence register must
> > +      * be cleared before we can use any other fences to ensure that
> > +      * the new fences do not overlap the elided clears, confusing HW.
> >        */
> >       wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
> > -     if (wakeref) {
> > -             fence_write(fence, vma);
> > -             intel_runtime_pm_put(fence->i915, wakeref);
> > +     if (!wakeref) {
> > +             GEM_BUG_ON(vma);
> > +             return 0;
> >       }
> >  
> > -     if (vma) {
> > -             if (fence->vma != vma) {
> > -                     vma->fence = fence;
> > -                     fence->vma = vma;
> > -             }
> > +     fence_write(fence, vma);
> > +     fence->vma = vma;
> >  
> > +     if (vma) {
> > +             vma->fence = fence;
> >               list_move_tail(&fence->link, &fence->i915->mm.fence_list);
> >       }
> >  
> > +     intel_runtime_pm_put(fence->i915, wakeref);
> >       return 0;
> >  }
> >  
> > @@ -473,9 +480,10 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
> >  {
> >       int i;
> >  
> > +     rcu_read_lock(); /* keep obj alive as we dereference */
> >       for (i = 0; i < dev_priv->num_fence_regs; i++) {
> >               struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
> 
> I have spent some amount of time trying to figure out if
> there is a reasoning of sometimes calling the fence reg as 'fence'
> and in other cases 'reg'.
> 
> If there is a reason, help me out. If there is not, I
> politely ask to follow the same naming as in revoke_fences.

The hw is known as fences, but so are other things. reg is too general,
and the use here is inconsistent with every other use of reg. In short,
it really doesn't matter...

> Or that we go for 'fence_reg' always when talking about
> preallocated reg slots.

Except now you are pulling my leg.

> > -             struct i915_vma *vma = reg->vma;
> > +             struct i915_vma *vma = READ_ONCE(reg->vma);
> >  
> >               GEM_BUG_ON(vma && vma->fence != reg);
> >  
> > @@ -483,18 +491,12 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
> >                * Commit delayed tiling changes if we have an object still
> >                * attached to the fence, otherwise just clear the fence.
> >                */
> > -             if (vma && !i915_gem_object_is_tiled(vma->obj)) {
> > -                     GEM_BUG_ON(!reg->dirty);
> 
> You omit the dirty check here. If the reasoning is
> that we can't sample due to inconsistency wrt rest of fence reg,
> does it then lead to need to make a __fence_write()
> that does not write the dirty flag.

Because it doesn't matter, we are just flushing the register back to the
known state.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-07 13:38     ` Chris Wilson
@ 2019-02-07 14:09       ` Mika Kuoppala
  2019-02-07 14:13         ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-07 14:09 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-02-07 13:22:45)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > Currently, we may simultaneously release the fence register from both
>> > fence_update() and i915_gem_restore_fences(). This is dangerous, so
>> > defer the bookkeeping entirely to i915_gem_restore_fences() when the
>> > device is asleep.
>> >
>> > Reported-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

A tad overstatement but fine. I feel like I was wandering
in these hoods, being lost and confused and bumping into fences.

>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
>> > ---
>> >  drivers/gpu/drm/i915/i915_gem_fence_reg.c | 62 ++++++++++++-----------
>> >  1 file changed, 32 insertions(+), 30 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
>> > index e037e94792f3..be89bd95ab7c 100644
>> > --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
>> > +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
>> > @@ -210,6 +210,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>> >                       struct i915_vma *vma)
>> >  {
>> >       intel_wakeref_t wakeref;
>> > +     struct i915_vma *old;
>> >       int ret;
>> >  
>> >       if (vma) {
>> > @@ -229,49 +230,55 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>> >                       return ret;
>> >       }
>> >  
>> > -     if (fence->vma) {
>> > -             struct i915_vma *old = fence->vma;
>> > -
>> > +     old = xchg(&fence->vma, NULL);
>> 
>> So this is for restore seeing fence consistently.
>> 
>> > +     if (old) {
>> >               ret = i915_active_request_retire(&old->last_fence,
>> >                                            &old->obj->base.dev->struct_mutex);
>> > -             if (ret)
>> > +             if (ret) {
>> > +                     fence->vma = old;
>> 
>> And this then won't matter as the restore fences had zeroed
>> the fence reg before error. We get back to exact state
>> later but when?
>
> This operation is under the mutex guarding the fences, and the previous
> fence was unpinned so not in use. Prior to being used, all is
> consistent.

Ah didn't get the unpinning part.

>
>> >                       return ret;
>> > +             }
>> >  
>> >               i915_vma_flush_writes(old);
>> > -     }
>> >  
>> > -     if (fence->vma && fence->vma != vma) {
>> > -             /* Ensure that all userspace CPU access is completed before
>> > +             /*
>> > +              * Ensure that all userspace CPU access is completed before
>> >                * stealing the fence.
>> >                */
>> > -             GEM_BUG_ON(fence->vma->fence != fence);
>> > -             i915_vma_revoke_mmap(fence->vma);
>> > -
>> > -             fence->vma->fence = NULL;
>> > -             fence->vma = NULL;
>> > +             if (old != vma) {
>> > +                     GEM_BUG_ON(old->fence != fence);
>> > +                     i915_vma_revoke_mmap(old);
>> > +                     old->fence = NULL;
>> > +             }
>> >  
>> >               list_move(&fence->link, &fence->i915->mm.fence_list);
>> >       }
>> >  
>> > -     /* We only need to update the register itself if the device is awake.
>> > +     /*
>> > +      * We only need to update the register itself if the device is awake.
>> >        * If the device is currently powered down, we will defer the write
>> >        * to the runtime resume, see i915_gem_restore_fences().
>> > +      *
>> > +      * This only works for removing the fence register, on acquisition
>> > +      * the caller must hold the rpm wakeref. The fence register must
>> > +      * be cleared before we can use any other fences to ensure that
>> > +      * the new fences do not overlap the elided clears, confusing HW.
>> >        */
>> >       wakeref = intel_runtime_pm_get_if_in_use(fence->i915);
>> > -     if (wakeref) {
>> > -             fence_write(fence, vma);
>> > -             intel_runtime_pm_put(fence->i915, wakeref);
>> > +     if (!wakeref) {
>> > +             GEM_BUG_ON(vma);
>> > +             return 0;
>> >       }
>> >  
>> > -     if (vma) {
>> > -             if (fence->vma != vma) {
>> > -                     vma->fence = fence;
>> > -                     fence->vma = vma;
>> > -             }
>> > +     fence_write(fence, vma);
>> > +     fence->vma = vma;
>> >  
>> > +     if (vma) {
>> > +             vma->fence = fence;
>> >               list_move_tail(&fence->link, &fence->i915->mm.fence_list);
>> >       }
>> >  
>> > +     intel_runtime_pm_put(fence->i915, wakeref);
>> >       return 0;
>> >  }
>> >  
>> > @@ -473,9 +480,10 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>> >  {
>> >       int i;
>> >  
>> > +     rcu_read_lock(); /* keep obj alive as we dereference */
>> >       for (i = 0; i < dev_priv->num_fence_regs; i++) {
>> >               struct drm_i915_fence_reg *reg = &dev_priv->fence_regs[i];
>> 
>> I have spent some amount of time trying to figure out if
>> there is a reasoning of sometimes calling the fence reg as 'fence'
>> and in other cases 'reg'.
>> 
>> If there is a reason, help me out. If there is not, I
>> politely ask to follow the same naming as in revoke_fences.
>
> The hw is known as fences, but so are other things. reg is too general,
> and the use here is inconsistent with every other use of reg. In short,
> it really doesn't matter...
>
>> Or that we go for 'fence_reg' always when talking about
>> preallocated reg slots.
>
> Except now you are pulling my leg.

Nope!

>
>> > -             struct i915_vma *vma = reg->vma;
>> > +             struct i915_vma *vma = READ_ONCE(reg->vma);
>> >  
>> >               GEM_BUG_ON(vma && vma->fence != reg);
>> >  
>> > @@ -483,18 +491,12 @@ void i915_gem_restore_fences(struct drm_i915_private *dev_priv)
>> >                * Commit delayed tiling changes if we have an object still
>> >                * attached to the fence, otherwise just clear the fence.
>> >                */
>> > -             if (vma && !i915_gem_object_is_tiled(vma->obj)) {
>> > -                     GEM_BUG_ON(!reg->dirty);
>> 
>> You omit the dirty check here. If the reasoning is
>> that we can't sample due to inconsistency wrt rest of fence reg,
>> does it then lead to need to make a __fence_write()
>> that does not write the dirty flag.
>
> Because it doesn't matter, we are just flushing the register back to the
> known state.

Ah I see, well perhaps there is then the possibility to throw out
the dirty trickery out from suspend/restore side in future.

The issue I was concerned is addressed so,
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup
  2019-02-07 14:09       ` Mika Kuoppala
@ 2019-02-07 14:13         ` Chris Wilson
  0 siblings, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-07 14:13 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-07 14:09:15)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Because it doesn't matter, we are just flushing the register back to the
> > known state.
> 
> Ah I see, well perhaps there is then the possibility to throw out
> the dirty trickery out from suspend/restore side in future.

Yes, I wrote such a patch a few hours ago as I stared at last_fence in
dismay wishing we had pipelined fences instead. Might just resurrect
those as otherwise it's quite a gruesome casualty of the struct_mutex
wars.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-02-07  7:18 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-02-07 15:05   ` Mika Kuoppala
  0 siblings, 0 replies; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-07 15:05 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Previously, we were able to rely on the recursive properties of
> struct_mutex to allow us to serialise revoking mmaps and reacquiring the
> FENCE registers with them being clobbered over a global device reset.
> I then proceeded to throw out the baby with the bath water in order to
> pursue a struct_mutex-less reset.
>
> Perusing LWN for alternative strategies, the dilemma on how to serialise
> access to a global resource on one side was answered by
> https://lwn.net/Articles/202847/ -- Sleepable RCU:

This has been a helpful reference for understanding this patch.

>
>     1  int readside(void) {
>     2      int idx;
>     3      rcu_read_lock();
>     4	   if (nomoresrcu) {
>     5          rcu_read_unlock();
>     6	       return -EINVAL;
>     7      }
>     8	   idx = srcu_read_lock(&ss);
>     9	   rcu_read_unlock();
>     10	   /* SRCU read-side critical section. */
>     11	   srcu_read_unlock(&ss, idx);
>     12	   return 0;
>     13 }
>     14
>     15 void cleanup(void)
>     16 {
>     17     nomoresrcu = 1;
>     18     synchronize_rcu();
>     19     synchronize_srcu(&ss);
>     20     cleanup_srcu_struct(&ss);
>     21 }
>
> No more worrying about stop_machine, just an uber-complex mutex,
> optimised for reads, with the overhead pushed to the rare reset path.
>
> However, we do run the risk of a deadlock as we allocate underneath the
> SRCU read lock, and the allocation may require a GPU reset, causing a
> dependency cycle via the in-flight requests. We resolve that by declaring
> the driver wedged and cancelling all in-flight rendering.
>
> v2: Use expedited rcu barriers to match our earlier timing
> characteristics.
> v3: Try to annotate locking contexts for sparse
> v4: Reduce selftest lock duration to avoid a reset deadlock with fences
> v5: s/srcu/reset_backoff_srcu/
> v6: Remove more stale comments
>
> Testcase: igt/gem_mmap_gtt/hang
> Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c           |  12 +-
>  drivers/gpu/drm/i915/i915_drv.h               |  18 +--
>  drivers/gpu/drm/i915/i915_gem.c               |  56 +++------
>  drivers/gpu/drm/i915/i915_gem_fence_reg.c     |  36 ++----
>  drivers/gpu/drm/i915/i915_gpu_error.h         |  39 ++-----
>  drivers/gpu/drm/i915/i915_reset.c             | 110 +++++++++++-------
>  drivers/gpu/drm/i915/i915_reset.h             |   4 +
>  drivers/gpu/drm/i915/intel_drv.h              |   3 -
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  |   5 +-
>  .../gpu/drm/i915/selftests/mock_gem_device.c  |   1 +
>  10 files changed, 117 insertions(+), 167 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 0bd890c04fe4..a6fd157b1637 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
>  	intel_wakeref_t wakeref;
>  	enum intel_engine_id id;
>  
> +	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
>  	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
> -		seq_puts(m, "Wedged\n");
> +		seq_puts(m, "\tWedged\n");
>  	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
> -		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
> -	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
> -		seq_puts(m, "Waiter holding struct mutex\n");
> -	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
> -		seq_puts(m, "struct_mutex blocked for reset\n");
> +		seq_puts(m, "\tDevice (global) reset in progress\n");
>  
>  	if (!i915_modparams.enable_hangcheck) {
>  		seq_puts(m, "Hangcheck disabled\n");
> @@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
>  	 * while it is writing to 'i915_wedged'
>  	 */
>  
> -	if (i915_reset_backoff(&i915->gpu_error))
> -		return -EAGAIN;
> -
>  	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
>  			  "Manually set wedged engine mask = %llx", val);
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index a2293152cb6a..37230ae7fbe6 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -2989,7 +2989,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
>  	i915_gem_object_unpin_pages(obj);
>  }
>  
> -int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
> +static inline int __must_check
> +i915_mutex_lock_interruptible(struct drm_device *dev)
> +{
> +	return mutex_lock_interruptible(&dev->struct_mutex);
> +}
> +
>  int i915_gem_dumb_create(struct drm_file *file_priv,
>  			 struct drm_device *dev,
>  			 struct drm_mode_create_dumb *args);
> @@ -3006,21 +3011,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
>  struct i915_request *
>  i915_gem_find_active_request(struct intel_engine_cs *engine);
>  
> -static inline bool i915_reset_backoff(struct i915_gpu_error *error)
> -{
> -	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
> -}
> -
>  static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
>  {
>  	return unlikely(test_bit(I915_WEDGED, &error->flags));
>  }
>  
> -static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
> -{
> -	return i915_reset_backoff(error) | i915_terminally_wedged(error);
> -}
> -

Stale comments is a broad term, nothing to dislike here. 

>  static inline u32 i915_reset_count(struct i915_gpu_error *error)
>  {
>  	return READ_ONCE(error->reset_count);
> @@ -3093,7 +3088,6 @@ struct drm_i915_fence_reg *
>  i915_reserve_fence(struct drm_i915_private *dev_priv);
>  void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
>  
> -void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
>  void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
>  
>  void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 05ce9176ac4e..1eb3a5f8654c 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
>  	spin_unlock(&dev_priv->mm.object_stat_lock);
>  }
>  
> -static int
> -i915_gem_wait_for_error(struct i915_gpu_error *error)
> -{
> -	int ret;
> -
> -	might_sleep();
> -
> -	/*
> -	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
> -	 * userspace. If it takes that long something really bad is going on and
> -	 * we should simply try to bail out and fail as gracefully as possible.
> -	 */
> -	ret = wait_event_interruptible_timeout(error->reset_queue,
> -					       !i915_reset_backoff(error),
> -					       I915_RESET_TIMEOUT);
> -	if (ret == 0) {
> -		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
> -		return -EIO;
> -	} else if (ret < 0) {
> -		return ret;
> -	} else {
> -		return 0;
> -	}
> -}
> -
> -int i915_mutex_lock_interruptible(struct drm_device *dev)
> -{
> -	struct drm_i915_private *dev_priv = to_i915(dev);
> -	int ret;
> -
> -	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
> -	if (ret)
> -		return ret;
> -
> -	ret = mutex_lock_interruptible(&dev->struct_mutex);
> -	if (ret)
> -		return ret;
> -
> -	return 0;
> -}
> -
>  static u32 __i915_gem_park(struct drm_i915_private *i915)
>  {
>  	intel_wakeref_t wakeref;
> @@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
>  	intel_wakeref_t wakeref;
>  	struct i915_vma *vma;
>  	pgoff_t page_offset;
> +	int srcu;
>  	int ret;
>  
>  	/* Sanity check that we allow writing into this object */
> @@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
>  		goto err_unlock;
>  	}
>  
> -
>  	/* Now pin it into the GTT as needed */
>  	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
>  				       PIN_MAPPABLE |
> @@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
>  	if (ret)
>  		goto err_unpin;
>  
> +	srcu = i915_reset_trylock(dev_priv);
> +	if (srcu < 0) {
> +		ret = srcu;
> +		goto err_unpin;
> +	}
> +
>  	ret = i915_vma_pin_fence(vma);
>  	if (ret)
> -		goto err_unpin;
> +		goto err_reset;
>  
>  	/* Finally, remap it using the new GTT offset */
>  	ret = remap_io_mapping(area,
> @@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
>  
>  err_fence:
>  	i915_vma_unpin_fence(vma);
> +err_reset:
> +	i915_reset_unlock(dev_priv, srcu);
>  err_unpin:
>  	__i915_vma_unpin(vma);
>  err_unlock:
> @@ -5326,6 +5293,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>  	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
>  	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
>  	mutex_init(&dev_priv->gpu_error.wedge_mutex);
> +	init_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
>  
>  	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
>  
> @@ -5358,6 +5326,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
>  	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
>  	WARN_ON(dev_priv->mm.object_count);
>  
> +	cleanup_srcu_struct(&dev_priv->gpu_error.reset_backoff_srcu);
> +
>  	kmem_cache_destroy(dev_priv->priorities);
>  	kmem_cache_destroy(dev_priv->dependencies);
>  	kmem_cache_destroy(dev_priv->requests);
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index be89bd95ab7c..1ec1417cf8b4 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -270,6 +270,10 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  		return 0;
>  	}
>  
> +	ret = i915_reset_trylock(fence->i915);
> +	if (ret < 0)
> +		goto out_rpm;
> +
>  	fence_write(fence, vma);
>  	fence->vma = vma;
>  
> @@ -278,8 +282,12 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>  		list_move_tail(&fence->link, &fence->i915->mm.fence_list);
>  	}
>  
> +	i915_reset_unlock(fence->i915, ret);
> +	ret = 0;
> +
> +out_rpm:
>  	intel_runtime_pm_put(fence->i915, wakeref);
> -	return 0;
> +	return ret;
>  }
>  
>  /**
> @@ -442,32 +450,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
>  	list_add(&fence->link, &fence->i915->mm.fence_list);
>  }
>  
> -/**
> - * i915_gem_revoke_fences - revoke fence state
> - * @dev_priv: i915 device private
> - *
> - * Removes all GTT mmappings via the fence registers. This forces any user
> - * of the fence to reacquire that fence before continuing with their access.
> - * One use is during GPU reset where the fence register is lost and we need to
> - * revoke concurrent userspace access via GTT mmaps until the hardware has been
> - * reset and the fence registers have been restored.
> - */
> -void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
> -{
> -	int i;
> -
> -	lockdep_assert_held(&dev_priv->drm.struct_mutex);
> -
> -	for (i = 0; i < dev_priv->num_fence_regs; i++) {
> -		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
> -
> -		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
> -
> -		if (fence->vma)
> -			i915_vma_revoke_mmap(fence->vma);
> -	}
> -}
> -
>  /**
>   * i915_gem_restore_fences - restore fence state
>   * @dev_priv: i915 device private
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
> index 53b1f22dd365..afa3adb28f02 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.h
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.h
> @@ -204,39 +204,13 @@ struct i915_gpu_error {
>  
>  	atomic_t pending_fb_pin;
>  
> -	/**
> -	 * State variable controlling the reset flow and count
> -	 *
> -	 * This is a counter which gets incremented when reset is triggered,
> -	 *
> -	 * Before the reset commences, the I915_RESET_BACKOFF bit is set
> -	 * meaning that any waiters holding onto the struct_mutex should
> -	 * relinquish the lock immediately in order for the reset to start.
> -	 *
> -	 * If reset is not completed successfully, the I915_WEDGE bit is
> -	 * set meaning that hardware is terminally sour and there is no
> -	 * recovery. All waiters on the reset_queue will be woken when
> -	 * that happens.
> -	 *
> -	 * This counter is used by the wait_seqno code to notice that reset
> -	 * event happened and it needs to restart the entire ioctl (since most
> -	 * likely the seqno it waited for won't ever signal anytime soon).
> -	 *
> -	 * This is important for lock-free wait paths, where no contended lock
> -	 * naturally enforces the correct ordering between the bail-out of the
> -	 * waiter and the gpu reset work code.
> -	 */

I noticed a one-line comment on an earlier patch. I don't want to go
back to see if I did notice this elephant in here.


> -	unsigned long reset_count;
> -
>  	/**
>  	 * flags: Control various stages of the GPU reset
>  	 *
> -	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
> -	 * other users acquiring the struct_mutex. To do this we set the
> -	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
> -	 * and then check for that bit before acquiring the struct_mutex (in
> -	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
> -	 * secondary role in preventing two concurrent global reset attempts.
> +	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
> +	 * serialise with any other users attempting to do the same, and
> +	 * any global resources that may be clobber by the reset (such as
> +	 * FENCE registers).
>  	 *
>  	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
>  	 * acquire the struct_mutex to reset an engine, we need an explicit
> @@ -255,6 +229,9 @@ struct i915_gpu_error {
>  #define I915_RESET_ENGINE	2
>  #define I915_WEDGED		(BITS_PER_LONG - 1)
>  
> +	/** Number of times the device has been reset (global) */
> +	u32 reset_count;
> +
>  	/** Number of times an engine has been reset */
>  	u32 reset_engine_count[I915_NUM_ENGINES];
>  
> @@ -272,6 +249,8 @@ struct i915_gpu_error {
>  	 */
>  	wait_queue_head_t reset_queue;
>  
> +	struct srcu_struct reset_backoff_srcu;
> +

I hope it is not too verbose now. On first reading
of the patch there is more confusion and one desires
verboseness on all the ropes one hangs on. Later that
desire obviously diminishes, assuming
understanding grows.

But there is not much code on top of it and
on there, reader knows what it is for.

>  	struct i915_gpu_restart *restart;
>  };
>  
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 0e0ddf2e6815..c67d6c2a09a2 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -639,6 +639,32 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
>  	engine->reset.prepare(engine);
>  }
>  
> +static void revoke_mmaps(struct drm_i915_private *i915)
> +{
> +	int i;
> +
> +	for (i = 0; i < i915->num_fence_regs; i++) {
> +		struct drm_vma_offset_node *node;
> +		struct i915_vma *vma;
> +		u64 vma_offset;
> +
> +		vma = READ_ONCE(i915->fence_regs[i].vma);
> +		if (!vma)
> +			continue;
> +
> +		if (!i915_vma_has_userfault(vma))
> +			continue;
> +
> +		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
> +		node = &vma->obj->base.vma_node;
> +		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
> +		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
> +				    drm_vma_node_offset_addr(node) + vma_offset,
> +				    vma->size,
> +				    1);

The code maps to a subset of i915_vma_revoke_mmap. Slight potential
to consolidate, but this was more of a peace-of-mind
check for the reviewer that the details are identical.

> +	}
> +}
> +
>  static void reset_prepare(struct drm_i915_private *i915)
>  {
>  	struct intel_engine_cs *engine;
> @@ -648,6 +674,7 @@ static void reset_prepare(struct drm_i915_private *i915)
>  		reset_prepare_engine(engine);
>  
>  	intel_uc_sanitize(i915);
> +	revoke_mmaps(i915);
>  }
>  
>  static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> @@ -911,50 +938,22 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  	return ret;
>  }
>  
> -struct __i915_reset {
> -	struct drm_i915_private *i915;
> -	unsigned int stalled_mask;
> -};
> -
> -static int __i915_reset__BKL(void *data)
> -{
> -	struct __i915_reset *arg = data;
> -	int err;
> -
> -	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
> -	if (err)
> -		return err;
> -
> -	return gt_reset(arg->i915, arg->stalled_mask);
> -}
> -
> -#if RESET_UNDER_STOP_MACHINE
> -/*
> - * XXX An alternative to using stop_machine would be to park only the
> - * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
> - * we should be able to prevent their memmory accesses via the lost fence
> - * registers over the course of the reset without the potential recursive
> - * of mutexes between the pagefault handler and reset.
> - *
> - * See igt/gem_mmap_gtt/hang
> - */
> -#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
> -#else
> -#define __do_reset(fn, arg) fn(arg)
> -#endif
> -
>  static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
>  {
> -	struct __i915_reset arg = { i915, stalled_mask };
>  	int err, i;
>  
> -	err = __do_reset(__i915_reset__BKL, &arg);
> +	/* Flush everyone currently using a resource about to be clobbered */
> +	synchronize_srcu(&i915->gpu_error.reset_backoff_srcu);
> +
> +	err = intel_gpu_reset(i915, ALL_ENGINES);
>  	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
> -		msleep(100);
> -		err = __do_reset(__i915_reset__BKL, &arg);
> +		msleep(10 * (i + 1));
> +		err = intel_gpu_reset(i915, ALL_ENGINES);
>  	}
> +	if (err)
> +		return err;
>  
> -	return err;
> +	return gt_reset(i915, stalled_mask);
>  }
>  
>  /**
> @@ -966,8 +965,6 @@ static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
>   * Reset the chip.  Useful if a hang is detected. Marks the device as wedged
>   * on failure.
>   *
> - * Caller must hold the struct_mutex.
> - *
>   * Procedure is fairly simple:
>   *   - reset the chip using the reset reg
>   *   - re-init context state
> @@ -1274,9 +1271,12 @@ void i915_handle_error(struct drm_i915_private *i915,
>  		wait_event(i915->gpu_error.reset_queue,
>  			   !test_bit(I915_RESET_BACKOFF,
>  				     &i915->gpu_error.flags));
> -		goto out;
> +		goto out; /* piggy-back on the other reset */
>  	}
>  
> +	/* Make sure i915_reset_trylock() sees the I915_RESET_BACKOFF */
> +	synchronize_rcu_expedited();
> +
>  	/* Prevent any other reset-engine attempt. */
>  	for_each_engine(engine, i915, tmp) {
>  		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> @@ -1300,6 +1300,36 @@ void i915_handle_error(struct drm_i915_private *i915,
>  	intel_runtime_pm_put(i915, wakeref);
>  }
>  
> +int i915_reset_trylock(struct drm_i915_private *i915)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	int srcu;
> +
> +	rcu_read_lock();
> +	while (test_bit(I915_RESET_BACKOFF, &error->flags)) {

The mb is implicit in here, and as the set returns
a value, it is explicit on the write side. So it should be ok.

> +		rcu_read_unlock();
> +
> +		if (wait_event_interruptible(error->reset_queue,
> +					     !test_bit(I915_RESET_BACKOFF,
> +						       &error->flags)))
> +			return -EINTR;
> +
> +		rcu_read_lock();
> +	}
> +	srcu = srcu_read_lock(&error->reset_backoff_srcu);
> +	rcu_read_unlock();
> +
> +	return srcu;
> +}
> +
> +void i915_reset_unlock(struct drm_i915_private *i915, int tag)
> +__releases(&i915->gpu_error.reset_backoff_srcu)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +
> +	srcu_read_unlock(&error->reset_backoff_srcu, tag);
> +}
> +
>  bool i915_reset_flush(struct drm_i915_private *i915)
>  {
>  	int err;
> diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
> index f2d347f319df..893c5d1c2eb8 100644
> --- a/drivers/gpu/drm/i915/i915_reset.h
> +++ b/drivers/gpu/drm/i915/i915_reset.h
> @@ -9,6 +9,7 @@
>  
>  #include <linux/compiler.h>
>  #include <linux/types.h>
> +#include <linux/srcu.h>
>  
>  struct drm_i915_private;
>  struct intel_engine_cs;
> @@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
>  void i915_reset_request(struct i915_request *rq, bool guilty);
>  bool i915_reset_flush(struct drm_i915_private *i915);
>  
> +int __must_check i915_reset_trylock(struct drm_i915_private *i915);
> +void i915_reset_unlock(struct drm_i915_private *i915, int tag);
> +
>  bool intel_has_gpu_reset(struct drm_i915_private *i915);
>  bool intel_has_reset_engine(struct drm_i915_private *i915);
>  
> diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
> index 9ec3d00fbd19..51a144534519 100644
> --- a/drivers/gpu/drm/i915/intel_drv.h
> +++ b/drivers/gpu/drm/i915/intel_drv.h
> @@ -989,9 +989,6 @@ struct intel_crtc {
>  
>  	struct intel_crtc_state *config;
>  
> -	/* global reset count when the last flip was submitted */
> -	unsigned int reset_count;
> -

There is even a different level of staleness here.


Ok, I remember Tvrtko mentioning on some patch
that he needed a few hours and a snack break in between
when reviewing. I 'win' on that front, with a large
margin.

Now when all pieces are in place, it is a trick^H^H
elegant way to block clients from accessing memory
when we pull out the hardware beneath them.

I am torn between needing to expand the mechanism
into a more verbose comment. But then, the comments
are already there and it is mostly about
learning rcu/srcu at a level that makes you
believe this works. And there is plenty of
subject material on that topic.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>


>  	/* Access to these should be protected by dev_priv->irq_lock. */
>  	bool cpu_fifo_underrun_disabled;
>  	bool pch_fifo_underrun_disabled;
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 7b6f3bea9ef8..4886fac12628 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -1039,8 +1039,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
>  
>  	/* Check that we can recover an unbind stuck on a hanging request */
>  
> -	igt_global_reset_lock(i915);
> -
>  	mutex_lock(&i915->drm.struct_mutex);
>  	err = hang_init(&h, i915);
>  	if (err)
> @@ -1138,7 +1136,9 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
>  	}
>  
>  out_reset:
> +	igt_global_reset_lock(i915);
>  	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
> +	igt_global_reset_unlock(i915);
>  
>  	if (tsk) {
>  		struct igt_wedge_me w;
> @@ -1159,7 +1159,6 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
>  	hang_fini(&h);
>  unlock:
>  	mutex_unlock(&i915->drm.struct_mutex);
> -	igt_global_reset_unlock(i915);
>  
>  	if (i915_terminally_wedged(&i915->gpu_error))
>  		return -EIO;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 14ae46fda49f..fc516a2970f4 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
>  
>  	init_waitqueue_head(&i915->gpu_error.wait_queue);
>  	init_waitqueue_head(&i915->gpu_error.reset_queue);
> +	init_srcu_struct(&i915->gpu_error.reset_backoff_srcu);
>  	mutex_init(&i915->gpu_error.wedge_mutex);
>  
>  	i915->wq = alloc_ordered_workqueue("mock", 0);
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
                   ` (9 preceding siblings ...)
  2019-02-07  9:53 ` ✓ Fi.CI.IGT: " Patchwork
@ 2019-02-07 16:01 ` Joonas Lahtinen
  2019-02-07 16:05   ` Chris Wilson
  2019-02-07 16:21   ` Chris Wilson
  10 siblings, 2 replies; 33+ messages in thread
From: Joonas Lahtinen @ 2019-02-07 16:01 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Quoting Chris Wilson (2019-02-07 09:18:22)
> Apply backpressure to hogs that emit requests faster than the GPU can
> process them by waiting for their ring to be less than half-full before
> proceeding with taking the struct_mutex.
> 
> This is a gross hack to apply throttling backpressure, the long term
> goal is to remove the struct_mutex contention so that each client
> naturally waits, preferably in an asynchronous, nonblocking fashion
> (pipelined operations for the win), for their own resources and never
> blocks another client within the driver at least. (Realtime priority
> goals would extend to ensuring that resource contention favours high
> priority clients as well.)
> 
> This patch only limits excessive request production and does not attempt
> to throttle clients that block waiting for eviction (either global GTT or
> system memory) or any other global resources, see above for the long term
> goal.

Code checks out.

Some Testcase: links and some relative perf numbers might be in place.

It's a pretty arbitrarily tuned algorithm, after all :)

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> Cc: John Harrison <John.C.Harrison@Intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 63 ++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_ringbuffer.c    | 13 -----
>  drivers/gpu/drm/i915/intel_ringbuffer.h    | 12 +++++
>  3 files changed, 75 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 8eedf7cac493..84ef3abc567e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -753,6 +753,64 @@ static int eb_select_context(struct i915_execbuffer *eb)
>         return 0;
>  }
>  
> +static struct i915_request *__eb_wait_for_ring(struct intel_ring *ring)
> +{
> +       struct i915_request *rq;
> +
> +       if (intel_ring_update_space(ring) >= PAGE_SIZE)
> +               return NULL;
> +
> +       /*
> +        * Find a request that after waiting upon, there will be at least half
> +        * the ring available. The hystersis allows us to compete for the
> +        * shared ring and should mean that we sleep less often prior to
> +        * claiming our resources, but not so long that the ring completely
> +        * drains before we can submit our next request.
> +        */
> +       list_for_each_entry(rq, &ring->request_list, ring_link) {
> +               if (__intel_ring_space(rq->postfix,
> +                                      ring->emit, ring->size) > ring->size / 2)
> +                       break;
> +       }
> +       if (&rq->ring_link == &ring->request_list)
> +               return NULL; /* weird, we will check again later for real */
> +
> +       return i915_request_get(rq);
> +}
> +
> +static int eb_wait_for_ring(const struct i915_execbuffer *eb)
> +{
> +       const struct intel_context *ce;
> +       struct i915_request *rq;
> +       int ret = 0;
> +
> +       /*
> +        * Apply a light amount of backpressure to prevent excessive hogs
> +        * from blocking waiting for space whilst holding struct_mutex and
> +        * keeping all of their resources pinned.
> +        */
> +
> +       ce = to_intel_context(eb->ctx, eb->engine);
> +       if (!ce->ring) /* first use, assume empty! */
> +               return 0;
> +
> +       rq = __eb_wait_for_ring(ce->ring);
> +       if (rq) {
> +               mutex_unlock(&eb->i915->drm.struct_mutex);
> +
> +               if (i915_request_wait(rq,
> +                                     I915_WAIT_INTERRUPTIBLE,
> +                                     MAX_SCHEDULE_TIMEOUT) < 0)
> +                       ret = -EINTR;
> +
> +               i915_request_put(rq);
> +
> +               mutex_lock(&eb->i915->drm.struct_mutex);
> +       }
> +
> +       return ret;
> +}
> +
>  static int eb_lookup_vmas(struct i915_execbuffer *eb)
>  {
>         struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
> @@ -2291,6 +2349,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>         if (err)
>                 goto err_rpm;
>  
> +       err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */
> +       if (unlikely(err))
> +               goto err_unlock;
> +
>         err = eb_relocate(&eb);
>         if (err) {
>                 /*
> @@ -2435,6 +2497,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
>  err_vma:
>         if (eb.exec)
>                 eb_release_vmas(&eb);
> +err_unlock:
>         mutex_unlock(&dev->struct_mutex);
>  err_rpm:
>         intel_runtime_pm_put(eb.i915, wakeref);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index b889b27f8aeb..7f841dba87b3 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -49,19 +49,6 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
>                 I915_GEM_HWS_INDEX_ADDR);
>  }
>  
> -static unsigned int __intel_ring_space(unsigned int head,
> -                                      unsigned int tail,
> -                                      unsigned int size)
> -{
> -       /*
> -        * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> -        * same cacheline, the Head Pointer must not be greater than the Tail
> -        * Pointer."
> -        */
> -       GEM_BUG_ON(!is_power_of_2(size));
> -       return (head - tail - CACHELINE_BYTES) & (size - 1);
> -}
> -
>  unsigned int intel_ring_update_space(struct intel_ring *ring)
>  {
>         unsigned int space;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 4d4ea6963a72..710ffb221775 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -832,6 +832,18 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
>         return tail;
>  }
>  
> +static inline unsigned int
> +__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size)
> +{
> +       /*
> +        * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
> +        * same cacheline, the Head Pointer must not be greater than the Tail
> +        * Pointer."
> +        */
> +       GEM_BUG_ON(!is_power_of_2(size));
> +       return (head - tail - CACHELINE_BYTES) & (size - 1);
> +}
> +
>  void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
>  
>  int intel_engine_setup_common(struct intel_engine_cs *engine);
> -- 
> 2.20.1
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-07 16:01 ` [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Joonas Lahtinen
@ 2019-02-07 16:05   ` Chris Wilson
  2019-02-07 16:21   ` Chris Wilson
  1 sibling, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-07 16:05 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Joonas Lahtinen (2019-02-07 16:01:31)
> Quoting Chris Wilson (2019-02-07 09:18:22)
> > Apply backpressure to hogs that emit requests faster than the GPU can
> > process them by waiting for their ring to be less than half-full before
> > proceeding with taking the struct_mutex.
> > 
> > This is a gross hack to apply throttling backpressure, the long term
> > goal is to remove the struct_mutex contention so that each client
> > naturally waits, preferably in an asynchronous, nonblocking fashion
> > (pipelined operations for the win), for their own resources and never
> > blocks another client within the driver at least. (Realtime priority
> > goals would extend to ensuring that resource contention favours high
> > priority clients as well.)
> > 
> > This patch only limits excessive request production and does not attempt
> > to throttle clients that block waiting for eviction (either global GTT or
> > system memory) or any other global resources, see above for the long term
> > goal.
> 
> Code checks out.
> 
> Some Testcase: links and some relative perf numbers might be in place.

Fixes an arbitrary test case written with exactly this problem in mind
;)

Performance stress tests show no impact, which soothes my conscience as
my biggest worry was upsetting those microbenchmarks that try to measure
driver overhead.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
  2019-02-07 16:01 ` [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Joonas Lahtinen
  2019-02-07 16:05   ` Chris Wilson
@ 2019-02-07 16:21   ` Chris Wilson
  1 sibling, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-07 16:21 UTC (permalink / raw)
  To: Joonas Lahtinen, intel-gfx

Quoting Joonas Lahtinen (2019-02-07 16:01:31)
> Quoting Chris Wilson (2019-02-07 09:18:22)
> > Apply backpressure to hogs that emit requests faster than the GPU can
> > process them by waiting for their ring to be less than half-full before
> > proceeding with taking the struct_mutex.
> > 
> > This is a gross hack to apply throttling backpressure, the long term
> > goal is to remove the struct_mutex contention so that each client
> > naturally waits, preferably in an asynchronous, nonblocking fashion
> > (pipelined operations for the win), for their own resources and never
> > blocks another client within the driver at least. (Realtime priority
> > goals would extend to ensuring that resource contention favours high
> > priority clients as well.)
> > 
> > This patch only limits excessive request production and does not attempt
> > to throttle clients that block waiting for eviction (either global GTT or
> > system memory) or any other global resources, see above for the long term
> > goal.
> 
> Code checks out.
> 
> Some Testcase: links and some relative perf numbers might be in place.
> 
> It's a pretty arbitrarily tuned algorithm, after all :)
> 
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Added the testcase and a passing mention of no perf diff measured, and
pushed this patch because things look better in the morning.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 4/8] drm/i915: Force the GPU reset upon wedging
  2019-02-07  7:18 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
@ 2019-02-08  9:31   ` Mika Kuoppala
  2019-02-08  9:47     ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08  9:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> When declaring the GPU wedged, we do need to hit the GPU with the reset
> hammer so that its state matches our presumed state during cleanup. If
> the reset fails, it fails, and we may be unhappy but wedged. However, if
> we are testing our wedge/unwedged handling, the desync carries over into
> the next test and promptly explodes.
>
> References: https://bugs.freedesktop.org/show_bug.cgi?id=106702
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_reset.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index c67d6c2a09a2..64f26e17243a 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -532,9 +532,6 @@ typedef int (*reset_func)(struct drm_i915_private *,
>  
>  static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
>  {
> -	if (!i915_modparams.reset)
> -		return NULL;
> -
>  	if (INTEL_GEN(i915) >= 8)
>  		return gen8_reset_engines;
>  	else if (INTEL_GEN(i915) >= 6)
> @@ -599,6 +596,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *i915)
>  	if (USES_GUC(i915))
>  		return false;
>  
> +	if (!i915_modparams.reset)
> +		return NULL;
> +

I first thought there was no functional change
in this but it is play with get and has.

Part of the reset code complexity is the
deep nesting of 'reset' used on all
layers and you need effort to distinguish
what is prepping, driver resetting and
gpu resetting. But it is not all bad
as we have now 'gpu' popping up in places
where the actual hw is involved.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

>  	return intel_get_gpu_reset(i915);
>  }
>  
> @@ -824,7 +824,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  		reset_prepare_engine(engine);
>  
>  	/* Even if the GPU reset fails, it should still stop the engines */
> -	if (INTEL_GEN(i915) >= 5)
> +	if (!INTEL_INFO(i915)->gpu_reset_clobbers_display)
>  		intel_gpu_reset(i915, ALL_ENGINES);
>  
>  	for_each_engine(engine, i915, id) {
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-07  7:18 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
@ 2019-02-08  9:46   ` Mika Kuoppala
  2019-02-08 10:00     ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08  9:46 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> On wedging, we mark all executing requests as complete and all pending
> requests completed as soon as they are ready. Before unwedging though we
> wish to flush those pending requests prior to restoring default
> execution, and so we must wait. Do so interruptibly as we do not provide

uninterruptibly?

> the EINTR gracefully back to userspace in this case but persistent in

be persistent?

We lost the gracefulness due to not having the interruptible
mutex wait on reset path anymore?

-Mika

> the permanently wedged start without restarting the syscall.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_reset.c | 28 ++++++++--------------------
>  1 file changed, 8 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 64f26e17243a..c4fcb450bd80 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -862,7 +862,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  {
>  	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct i915_timeline *tl;
> -	bool ret = false;
>  
>  	if (!test_bit(I915_WEDGED, &error->flags))
>  		return true;
> @@ -887,30 +886,20 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  	mutex_lock(&i915->gt.timelines.mutex);
>  	list_for_each_entry(tl, &i915->gt.timelines.active_list, link) {
>  		struct i915_request *rq;
> -		long timeout;
>  
>  		rq = i915_active_request_get_unlocked(&tl->last_request);
>  		if (!rq)
>  			continue;
>  
>  		/*
> -		 * We can't use our normal waiter as we want to
> -		 * avoid recursively trying to handle the current
> -		 * reset. The basic dma_fence_default_wait() installs
> -		 * a callback for dma_fence_signal(), which is
> -		 * triggered by our nop handler (indirectly, the
> -		 * callback enables the signaler thread which is
> -		 * woken by the nop_submit_request() advancing the seqno
> -		 * and when the seqno passes the fence, the signaler
> -		 * then signals the fence waking us up).
> +		 * All internal dependencies (i915_requests) will have
> +		 * been flushed by the set-wedge, but we may be stuck waiting
> +		 * for external fences. These should all be capped to 10s
> +		 * (I915_FENCE_TIMEOUT) so this wait should not be unbounded
> +		 * in the worst case.
>  		 */
> -		timeout = dma_fence_default_wait(&rq->fence, true,
> -						 MAX_SCHEDULE_TIMEOUT);
> +		dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
>  		i915_request_put(rq);
> -		if (timeout < 0) {
> -			mutex_unlock(&i915->gt.timelines.mutex);
> -			goto unlock;
> -		}
>  	}
>  	mutex_unlock(&i915->gt.timelines.mutex);
>  
> @@ -931,11 +920,10 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  
>  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
> -	ret = true;
> -unlock:
> +
>  	mutex_unlock(&i915->gpu_error.wedge_mutex);
>  
> -	return ret;
> +	return true;
>  }
>  
>  static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 4/8] drm/i915: Force the GPU reset upon wedging
  2019-02-08  9:31   ` Mika Kuoppala
@ 2019-02-08  9:47     ` Chris Wilson
  0 siblings, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-08  9:47 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 09:31:48)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > When declaring the GPU wedged, we do need to hit the GPU with the reset
> > hammer so that its state matches our presumed state during cleanup. If
> > the reset fails, it fails, and we may be unhappy but wedged. However, if
> > we are testing our wedge/unwedged handling, the desync carries over into
> > the next test and promptly explodes.
> >
> > References: https://bugs.freedesktop.org/show_bug.cgi?id=106702
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Mika Kuoppala <mika.kuoppala@intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_reset.c | 8 ++++----
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> > index c67d6c2a09a2..64f26e17243a 100644
> > --- a/drivers/gpu/drm/i915/i915_reset.c
> > +++ b/drivers/gpu/drm/i915/i915_reset.c
> > @@ -532,9 +532,6 @@ typedef int (*reset_func)(struct drm_i915_private *,
> >  
> >  static reset_func intel_get_gpu_reset(struct drm_i915_private *i915)
> >  {
> > -     if (!i915_modparams.reset)
> > -             return NULL;
> > -
> >       if (INTEL_GEN(i915) >= 8)
> >               return gen8_reset_engines;
> >       else if (INTEL_GEN(i915) >= 6)
> > @@ -599,6 +596,9 @@ bool intel_has_gpu_reset(struct drm_i915_private *i915)
> >       if (USES_GUC(i915))
> >               return false;
> >  
> > +     if (!i915_modparams.reset)
> > +             return NULL;
> > +
> 
> I first thought there was no functional change
> in this but it is play with get and has.
> 
> Part of the reset code complexity is the
> deep nesting of 'reset' used on all
> layers and you need effort to distinguish
> what is prepping, driver resetting and
> gpu resetting. But it is not all bad
> as we have now 'gpu' popping up in places
> where the actual hw is involved.

The nice part about this is that it reduces the number of places that
are affected by the parameter to just i915_reset() (oh, and the
I915_PARAM query for has_gpu_reset). Should be far less surprising in
future.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
@ 2019-02-08  9:56   ` Mika Kuoppala
  2019-02-08 10:01     ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08  9:56 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Since we use the debugfs to recover the device after modifying the
> i915.reset parameter, we need to be sure that we apply the reset and not
> piggy-back onto a concurrent one in order for the parameter to take
> effect.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
>  1 file changed, 3 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index a6fd157b1637..8a488ffc8b7d 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
>  {
>  	struct drm_i915_private *i915 = data;
>  
> -	/*
> -	 * There is no safeguard against this debugfs entry colliding
> -	 * with the hangcheck calling same i915_handle_error() in
> -	 * parallel, causing an explosion. For now we assume that the
> -	 * test harness is responsible enough not to inject gpu hangs
> -	 * while it is writing to 'i915_wedged'
> -	 */
> +	/* Flush any previous reset before applying for a new one */
> +	wait_event(i915->gpu_error.reset_queue,
> +		   !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));

You removed the comment and yes this makes us wait on our turn
to flip the switch. But the hangcheck vs this race still holds.

Now even if two of them would pile on this switch... there should be
no harm, as in that case we see two log entries
resulting in a single reset.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

-Mika
>  
>  	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
>  			  "Manually set wedged engine mask = %llx", val);
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-08  9:46   ` Mika Kuoppala
@ 2019-02-08 10:00     ` Chris Wilson
  2019-02-08 15:07       ` Mika Kuoppala
  0 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-08 10:00 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 09:46:01)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > On wedging, we mark all executing requests as complete and all pending
> > requests completed as soon as they are ready. Before unwedging though we
> > wish to flush those pending requests prior to restoring default
> > execution, and so we must wait. Do so interruptibly as we do not provide
> 
> uninterruptibly?
> 
> > the EINTR gracefully back to userspace in this case but persistent in
> 
> be persistent?
> 
> We lost the gracefulness due to not having the interruptible
> mutex wait on reset path anymore?

No. We never had graceful handling, as we could never propagate the
EINTR from unwedge back to userspace so it became EIO instead.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged
  2019-02-08  9:56   ` Mika Kuoppala
@ 2019-02-08 10:01     ` Chris Wilson
  0 siblings, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-08 10:01 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 09:56:59)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Since we use the debugfs to recover the device after modifying the
> > i915.reset parameter, we need to be sure that we apply the reset and not
> > piggy-back onto a concurrent one in order for the parameter to take
> > effect.
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/i915_debugfs.c | 10 +++-------
> >  1 file changed, 3 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> > index a6fd157b1637..8a488ffc8b7d 100644
> > --- a/drivers/gpu/drm/i915/i915_debugfs.c
> > +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> > @@ -3874,13 +3874,9 @@ i915_wedged_set(void *data, u64 val)
> >  {
> >       struct drm_i915_private *i915 = data;
> >  
> > -     /*
> > -      * There is no safeguard against this debugfs entry colliding
> > -      * with the hangcheck calling same i915_handle_error() in
> > -      * parallel, causing an explosion. For now we assume that the
> > -      * test harness is responsible enough not to inject gpu hangs
> > -      * while it is writing to 'i915_wedged'
> > -      */
> > +     /* Flush any previous reset before applying for a new one */
> > +     wait_event(i915->gpu_error.reset_queue,
> > +                !test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags));
> 
> You removed the comment and yes this makes us wait on our turn
> to flip the switch. But the hangcheck vs this race still holds.

Concurrent resets have been safe for yonks... But what I realised
about our piggy-backing of the 2 resets into one meant that if the value
of the i915.reset modparam changed we didn't run with the updated value.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 7/8] drm/i915: Serialise resets with wedging
  2019-02-07  7:18 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
@ 2019-02-08 14:30   ` Mika Kuoppala
  0 siblings, 0 replies; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08 14:30 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Prevent concurrent set-wedge with ongoing resets (and vice versa) by
> taking the same wedge_mutex around both operations.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_reset.c | 68 ++++++++++++++++++-------------
>  1 file changed, 40 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index c4fcb450bd80..9494b015185a 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -794,17 +794,14 @@ static void nop_submit_request(struct i915_request *request)
>  	intel_engine_queue_breadcrumbs(engine);
>  }
>  
> -void i915_gem_set_wedged(struct drm_i915_private *i915)
> +static void __i915_gem_set_wedged(struct drm_i915_private *i915)
>  {
>  	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>  
> -	mutex_lock(&error->wedge_mutex);
> -	if (test_bit(I915_WEDGED, &error->flags)) {
> -		mutex_unlock(&error->wedge_mutex);
> +	if (test_bit(I915_WEDGED, &error->flags))
>  		return;
> -	}
>  
>  	if (GEM_SHOW_DEBUG() && !intel_engines_are_idle(i915)) {
>  		struct drm_printer p = drm_debug_printer(__func__);
> @@ -853,12 +850,18 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
>  	set_bit(I915_WEDGED, &error->flags);
>  
>  	GEM_TRACE("end\n");
> -	mutex_unlock(&error->wedge_mutex);
> +}
>  
> -	wake_up_all(&error->reset_queue);
> +void i915_gem_set_wedged(struct drm_i915_private *i915)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +
> +	mutex_lock(&error->wedge_mutex);
> +	__i915_gem_set_wedged(i915);
> +	mutex_unlock(&error->wedge_mutex);
>  }
>  
> -bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> +static bool __i915_gem_unset_wedged(struct drm_i915_private *i915)
>  {
>  	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct i915_timeline *tl;
> @@ -869,8 +872,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  	if (!i915->gt.scratch) /* Never full initialised, recovery impossible */
>  		return false;
>  
> -	mutex_lock(&error->wedge_mutex);
> -
>  	GEM_TRACE("start\n");
>  
>  	/*
> @@ -921,11 +922,21 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>  	smp_mb__before_atomic(); /* complete takeover before enabling execbuf */
>  	clear_bit(I915_WEDGED, &i915->gpu_error.flags);
>  
> -	mutex_unlock(&i915->gpu_error.wedge_mutex);
> -
>  	return true;
>  }
>  
> +bool i915_gem_unset_wedged(struct drm_i915_private *i915)
> +{
> +	struct i915_gpu_error *error = &i915->gpu_error;
> +	bool result;
> +
> +	mutex_lock(&error->wedge_mutex);
> +	result = __i915_gem_unset_wedged(i915);
> +	mutex_unlock(&error->wedge_mutex);
> +
> +	return result;
> +}
> +
>  static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
>  {
>  	int err, i;
> @@ -975,7 +986,7 @@ void i915_reset(struct drm_i915_private *i915,
>  	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
>  
>  	/* Clear any previous failed attempts at recovery. Time to try again. */
> -	if (!i915_gem_unset_wedged(i915))
> +	if (!__i915_gem_unset_wedged(i915))
>  		return;
>  
>  	if (reason)
> @@ -1037,7 +1048,7 @@ void i915_reset(struct drm_i915_private *i915,
>  	 */
>  	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
>  error:
> -	i915_gem_set_wedged(i915);
> +	__i915_gem_set_wedged(i915);
>  	goto finish;
>  }
>  
> @@ -1129,7 +1140,9 @@ static void i915_reset_device(struct drm_i915_private *i915,
>  	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
>  		intel_prepare_reset(i915);
>  
> +		mutex_lock(&error->wedge_mutex);
>  		i915_reset(i915, engine_mask, reason);
> +		mutex_unlock(&error->wedge_mutex);
>  
>  		intel_finish_reset(i915);
>  	}
> @@ -1197,6 +1210,7 @@ void i915_handle_error(struct drm_i915_private *i915,
>  		       unsigned long flags,
>  		       const char *fmt, ...)
>  {
> +	struct i915_gpu_error *error = &i915->gpu_error;
>  	struct intel_engine_cs *engine;
>  	intel_wakeref_t wakeref;
>  	unsigned int tmp;
> @@ -1233,20 +1247,19 @@ void i915_handle_error(struct drm_i915_private *i915,
>  	 * Try engine reset when available. We fall back to full reset if
>  	 * single reset fails.
>  	 */
> -	if (intel_has_reset_engine(i915) &&
> -	    !i915_terminally_wedged(&i915->gpu_error)) {
> +	if (intel_has_reset_engine(i915) && !i915_terminally_wedged(error)) {
>  		for_each_engine_masked(engine, i915, engine_mask, tmp) {
>  			BUILD_BUG_ON(I915_RESET_MODESET >= I915_RESET_ENGINE);
>  			if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					     &i915->gpu_error.flags))
> +					     &error->flags))
>  				continue;
>  
>  			if (i915_reset_engine(engine, msg) == 0)
>  				engine_mask &= ~intel_engine_flag(engine);
>  
>  			clear_bit(I915_RESET_ENGINE + engine->id,
> -				  &i915->gpu_error.flags);
> -			wake_up_bit(&i915->gpu_error.flags,
> +				  &error->flags);
> +			wake_up_bit(&error->flags,
>  				    I915_RESET_ENGINE + engine->id);
>  		}
>  	}
> @@ -1255,10 +1268,9 @@ void i915_handle_error(struct drm_i915_private *i915,
>  		goto out;
>  
>  	/* Full reset needs the mutex, stop any other user trying to do so. */
> -	if (test_and_set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
> -		wait_event(i915->gpu_error.reset_queue,
> -			   !test_bit(I915_RESET_BACKOFF,
> -				     &i915->gpu_error.flags));
> +	if (test_and_set_bit(I915_RESET_BACKOFF, &error->flags)) {
> +		wait_event(error->reset_queue,
> +			   !test_bit(I915_RESET_BACKOFF, &error->flags));
>  		goto out; /* piggy-back on the other reset */
>  	}
>  
> @@ -1268,8 +1280,8 @@ void i915_handle_error(struct drm_i915_private *i915,
>  	/* Prevent any other reset-engine attempt. */
>  	for_each_engine(engine, i915, tmp) {
>  		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
> -					&i915->gpu_error.flags))
> -			wait_on_bit(&i915->gpu_error.flags,
> +					&error->flags))
> +			wait_on_bit(&error->flags,
>  				    I915_RESET_ENGINE + engine->id,
>  				    TASK_UNINTERRUPTIBLE);
>  	}
> @@ -1278,11 +1290,11 @@ void i915_handle_error(struct drm_i915_private *i915,
>  
>  	for_each_engine(engine, i915, tmp) {
>  		clear_bit(I915_RESET_ENGINE + engine->id,
> -			  &i915->gpu_error.flags);
> +			  &error->flags);
>  	}
>  
> -	clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
> -	wake_up_all(&i915->gpu_error.reset_queue);
> +	clear_bit(I915_RESET_BACKOFF, &error->flags);
> +	wake_up_all(&error->reset_queue);
>  
>  out:
>  	intel_runtime_pm_put(i915, wakeref);
> -- 
> 2.20.1
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] drm/i915: Don't claim an unstarted request was guilty
  2019-02-07  7:41   ` [PATCH] " Chris Wilson
@ 2019-02-08 14:47     ` Mika Kuoppala
  2019-02-08 14:58       ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08 14:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> If we haven't even begun executing the payload of the stalled request,
> then we should not claim that its userspace context was guilty of
> submitting a hanging batch.
>
> v2: Check for context corruption before trying to restart.
> v3: Preserve semaphores on skipping requests (need to keep the timelines
> intact).
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
>  drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 +++-
>  .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
>  3 files changed, 53 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 5e98fd79bd9d..e3134a635926 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
>  	*cs++ = rq->fence.seqno - 1;
>  
>  	intel_ring_advance(rq, cs);
> +
> +	/* Record the updated position of the request's payload */
> +	rq->infix = intel_ring_offset(rq, cs);
> +
>  	return 0;
>  }
>  
> @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
>  	spin_unlock_irqrestore(&engine->timeline.lock, flags);
>  }
>  
> +static bool lrc_regs_ok(const struct i915_request *rq)
> +{
> +	const struct intel_ring *ring = rq->ring;
> +	const u32 *regs = rq->hw_context->lrc_reg_state;
> +
> +	/* Quick spot check for the common signs of context corruption */
> +
> +	if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
> +	    (RING_CTL_SIZE(ring->size) | RING_VALID))
> +		return false;

You've been noticing this with ctx corruption? Well now
thinking about it, we have had reports where on init,
on some trouble, the valid vanishes.


> +
> +	if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
> +		return false;
> +

Seen this on bugzilla reports too. Are there more
up your sleeve, or is this a compromise on complexity
and performance? Checking on a sane actual head too?

The heuristics of it bothers me some as we will
get false positives.

So in effect, when we get one, we just move ahead
after an extra reset as we got it all wrong?

> +	return true;
> +}
> +
>  static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  {
>  	struct intel_engine_execlists * const execlists = &engine->execlists;
> @@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  	if (!rq)
>  		goto out_unlock;
>  
> +	/*
> +	 * If this request hasn't started yet, e.g. it is waiting on a
> +	 * semaphore, we need to avoid skipping the request or else we
> +	 * break the signaling chain. However, if the context is corrupt
> +	 * the request will not restart and we will be stuck with a wedged
> +	 * device. It is quite often the case that if we issue a reset
> +	 * while the GPU is loading the context image, that context image
> +	 * becomes corrupt.
> +	 *
> +	 * Otherwise, if we have not started yet, the request should replay
> +	 * perfectly and we do not need to flag the result as being erroneous.
> +	 */
> +	if (!i915_request_started(rq) && lrc_regs_ok(rq))
> +		goto out_unlock;
> +
>  	/*
>  	 * If the request was innocent, we leave the request in the ELSP
>  	 * and will try to replay it on restarting. The context image may
> @@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  	 * image back to the expected values to skip over the guilty request.
>  	 */
>  	i915_reset_request(rq, stalled);
> -	if (!stalled)
> +	if (!stalled && lrc_regs_ok(rq))

Extend the ctx validator usage for on guilty engines, well why not.
-Mika

>  		goto out_unlock;
>  
>  	/*
> @@ -1942,8 +1978,8 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
>  		       engine->context_size - PAGE_SIZE);
>  	}
>  
> -	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
> -	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
> +	/* Rerun the request; its payload has been neutered (if guilty). */
> +	rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
>  	intel_ring_update_space(rq->ring);
>  
>  	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
> diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> index 9ebd9225684e..86354e51bdd3 100644
> --- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
> +++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
> @@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
>  	*batch++ = upper_32_bits(vma->node.start);
>  	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
>  
> -	i915_gem_chipset_flush(spin->i915);
> +	if (engine->emit_init_breadcrumb &&
> +	    rq->timeline->has_initial_breadcrumb) {
> +		err = engine->emit_init_breadcrumb(rq);
> +		if (err)
> +			goto cancel_rq;
> +	}
>  
>  	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
>  
> +	i915_gem_chipset_flush(spin->i915);
> +
>  cancel_rq:
>  	if (err) {
>  		i915_request_skip(rq, err);
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 4886fac12628..36c17bfe05a7 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -246,6 +246,12 @@ hang_create_request(struct hang *h, struct intel_engine_cs *engine)
>  	if (INTEL_GEN(vm->i915) <= 5)
>  		flags |= I915_DISPATCH_SECURE;
>  
> +	if (rq->engine->emit_init_breadcrumb) {
> +		err = rq->engine->emit_init_breadcrumb(rq);
> +		if (err)
> +			goto cancel_rq;
> +	}
> +
>  	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
>  
>  cancel_rq:
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] drm/i915: Don't claim an unstarted request was guilty
  2019-02-08 14:47     ` Mika Kuoppala
@ 2019-02-08 14:58       ` Chris Wilson
  2019-02-08 15:31         ` Mika Kuoppala
  0 siblings, 1 reply; 33+ messages in thread
From: Chris Wilson @ 2019-02-08 14:58 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 14:47:13)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > If we haven't even begun executing the payload of the stalled request,
> > then we should not claim that its userspace context was guilty of
> > submitting a hanging batch.
> >
> > v2: Check for context corruption before trying to restart.
> > v3: Preserve semaphores on skipping requests (need to keep the timelines
> > intact).
> >
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >  drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
> >  drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 +++-
> >  .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
> >  3 files changed, 53 insertions(+), 4 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 5e98fd79bd9d..e3134a635926 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
> >       *cs++ = rq->fence.seqno - 1;
> >  
> >       intel_ring_advance(rq, cs);
> > +
> > +     /* Record the updated position of the request's payload */
> > +     rq->infix = intel_ring_offset(rq, cs);
> > +
> >       return 0;
> >  }
> >  
> > @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
> >       spin_unlock_irqrestore(&engine->timeline.lock, flags);
> >  }
> >  
> > +static bool lrc_regs_ok(const struct i915_request *rq)
> > +{
> > +     const struct intel_ring *ring = rq->ring;
> > +     const u32 *regs = rq->hw_context->lrc_reg_state;
> > +
> > +     /* Quick spot check for the common signs of context corruption */
> > +
> > +     if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
> > +         (RING_CTL_SIZE(ring->size) | RING_VALID))
> > +             return false;
> 
> You been noticing this with ctx corruption? Well now
> thinking about it, we have had reports where on init,
> on some trouble, the valid vanishes.

Yes, it's why we've been copying the default context over guilty for a
long time (pretty much since live_hangcheck became a thing iirc).

> > +
> > +     if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
> > +             return false;
> > +
> 
> Seen this on bugzilla reports too. Are there more
> in your sleeve or is this a compromise on complexity
> and performance? Checking on a sane actual head too?

No, I can't remember seeing this, just losing CTL. But I do recall that
at one point it seemed the whole reg state was zero, but that is foggy
memory.
 
> The heuristics of it bothers me some as we will
> get false positives.

They cannot be false positives! If we restore to a batch setup like
this, it will hang -- which is why we explicitly reset them.
 
> So in effect, when we get one, we just move ahead
> after an extra reset as we got it all wrong?

Yup. The context is corrupt, we replace it with a sane one and hope
nobody notices. Mesa does notice though....

> > +     return true;
> > +}
> > +
> >  static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> >  {
> >       struct intel_engine_execlists * const execlists = &engine->execlists;
> > @@ -1912,6 +1933,21 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> >       if (!rq)
> >               goto out_unlock;
> >  
> > +     /*
> > +      * If this request hasn't started yet, e.g. it is waiting on a
> > +      * semaphore, we need to avoid skipping the request or else we
> > +      * break the signaling chain. However, if the context is corrupt
> > +      * the request will not restart and we will be stuck with a wedged
> > +      * device. It is quite often the case that if we issue a reset
> > +      * while the GPU is loading the context image, that context image
> > +      * becomes corrupt.
> > +      *
> > +      * Otherwise, if we have not started yet, the request should replay
> > +      * perfectly and we do not need to flag the result as being erroneous.
> > +      */
> > +     if (!i915_request_started(rq) && lrc_regs_ok(rq))
> > +             goto out_unlock;
> > +
> >       /*
> >        * If the request was innocent, we leave the request in the ELSP
> >        * and will try to replay it on restarting. The context image may
> > @@ -1924,7 +1960,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
> >        * image back to the expected values to skip over the guilty request.
> >        */
> >       i915_reset_request(rq, stalled);
> > -     if (!stalled)
> > +     if (!stalled && lrc_regs_ok(rq))
> 
> Extend the ctx validator usage for on guilty engines, well why not.

We forcibly reset anyway.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-08 10:00     ` Chris Wilson
@ 2019-02-08 15:07       ` Mika Kuoppala
  2019-02-08 15:13         ` Chris Wilson
  0 siblings, 1 reply; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08 15:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-02-08 09:46:01)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > On wedging, we mark all executing requests as complete and all pending
>> > requests completed as soon as they are ready. Before unwedging though we
>> > wish to flush those pending requests prior to restoring default
>> > execution, and so we must wait. Do so interruptibly as we do not provide
>> 
>> uninterruptibly?
>> 
>> > the EINTR gracefully back to userspace in this case but persistent in
>> 
>> be persistent?
>> 
>> We lost the gracefulness due to not having the interruptible
>> mutex wait on reset path anymore?
>
> No. We never had graceful handling, as we could never propagate the
> EINTR from unwedge back to userspace so it became EIO instead.

Now I managed to entertain at least myself.
I try to find a deeper meaning so hard on this...

and there it is, in plain sight: the return code
is not used. I really did read it before
but obviously no rampup in powerstate was involved.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging
  2019-02-08 15:07       ` Mika Kuoppala
@ 2019-02-08 15:13         ` Chris Wilson
  0 siblings, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-08 15:13 UTC (permalink / raw)
  To: Mika Kuoppala, intel-gfx

Quoting Mika Kuoppala (2019-02-08 15:07:55)
> Chris Wilson <chris@chris-wilson.co.uk> writes:
> 
> > Quoting Mika Kuoppala (2019-02-08 09:46:01)
> >> Chris Wilson <chris@chris-wilson.co.uk> writes:
> >> 
> >> > On wedging, we mark all executing requests as complete and all pending
> >> > requests completed as soon as they are ready. Before unwedging though we
> >> > wish to flush those pending requests prior to restoring default
> >> > execution, and so we must wait. Do so interruptibly as we do not provide
> >> 
> >> uninterruptibly?
> >> 
> >> > the EINTR gracefully back to userspace in this case but persistent in
> >> 
> >> be persistent?
> >> 
> >> We lost the gracefulness due to not having the interruptible
> >> mutex wait on reset path anymore?
> >
> > No. We never had graceful handling, as we could never propagate the
> > EINTR from unwedge back to userspace so it became EIO instead.
> 
> Now I managed to entertain at least myself.
> I try to find a deeper meaning so hard on this...
> 
> and there it is, in plain sight: the return code
> is not used. I really did read it before
> but obviously no rampup in powerstate was involved.

Exactly. A task for the future would be to use killable here, let the
user die and leave the machine wedged. But that requires a bit more
effort :)
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* Re: [PATCH] drm/i915: Don't claim an unstarted request was guilty
  2019-02-08 14:58       ` Chris Wilson
@ 2019-02-08 15:31         ` Mika Kuoppala
  0 siblings, 0 replies; 33+ messages in thread
From: Mika Kuoppala @ 2019-02-08 15:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Quoting Mika Kuoppala (2019-02-08 14:47:13)
>> Chris Wilson <chris@chris-wilson.co.uk> writes:
>> 
>> > If we haven't even begun executing the payload of the stalled request,
>> > then we should not claim that its userspace context was guilty of
>> > submitting a hanging batch.
>> >
>> > v2: Check for context corruption before trying to restart.
>> > v3: Preserve semaphores on skipping requests (need to keep the timelines
>> > intact).
>> >
>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > ---
>> >  drivers/gpu/drm/i915/intel_lrc.c              | 42 +++++++++++++++++--
>> >  drivers/gpu/drm/i915/selftests/igt_spinner.c  |  9 +++-
>> >  .../gpu/drm/i915/selftests/intel_hangcheck.c  |  6 +++
>> >  3 files changed, 53 insertions(+), 4 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> > index 5e98fd79bd9d..e3134a635926 100644
>> > --- a/drivers/gpu/drm/i915/intel_lrc.c
>> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> > @@ -1387,6 +1387,10 @@ static int gen8_emit_init_breadcrumb(struct i915_request *rq)
>> >       *cs++ = rq->fence.seqno - 1;
>> >  
>> >       intel_ring_advance(rq, cs);
>> > +
>> > +     /* Record the updated position of the request's payload */
>> > +     rq->infix = intel_ring_offset(rq, cs);
>> > +
>> >       return 0;
>> >  }
>> >  
>> > @@ -1878,6 +1882,23 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
>> >       spin_unlock_irqrestore(&engine->timeline.lock, flags);
>> >  }
>> >  
>> > +static bool lrc_regs_ok(const struct i915_request *rq)
>> > +{
>> > +     const struct intel_ring *ring = rq->ring;
>> > +     const u32 *regs = rq->hw_context->lrc_reg_state;
>> > +
>> > +     /* Quick spot check for the common signs of context corruption */
>> > +
>> > +     if (regs[CTX_RING_BUFFER_CONTROL + 1] !=
>> > +         (RING_CTL_SIZE(ring->size) | RING_VALID))
>> > +             return false;
>> 
>> You been noticing this with ctx corruption? Well now
>> thinking about it, we have had reports where on init,
>> on some trouble, the valid vanishes.
>
> Yes, it's why we've been copying the default context over guilty for a
> long time (pretty much since live_hangcheck became a thing iirc).
>
>> > +
>> > +     if (regs[CTX_RING_BUFFER_START + 1] != i915_ggtt_offset(ring->vma))
>> > +             return false;
>> > +
>> 
>> Seen this on bugzilla reports too. Are there more
>> in your sleeve or is this a compromise on complexity
>> and performance? Checking on a sane actual head too?
>
> No, I can't remember seeing this, just losing CTL. But I do recall that
> at one point it seemed the whole reg state was zero, but that is foggy
> memory.

I might mix this with the failed init where the start was
bogus/stale. Looked like after the hang, the hardware just
swapped to a previous ctx without any provocation.

Regardless of the reason, this will guard the
restart.

>  
>> The heuristics of it bothers me some as we will
>> get false positives.
>
> They cannot be false positives! If we restore to a batch setup like
> this, it will hang -- which is why we explicitly reset them.
>  
>> So in effect, when we get one, we just move ahead
>> after an extra reset as we got it all wrong?
>
> Yup. The context is corrupt, we replace it with a sane one and hope
> nobody notices. Mesa does notice though....

We tell them about our shortcomings, if they choose to listen.

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 33+ messages in thread

* [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs
@ 2019-02-06 17:11 Chris Wilson
  0 siblings, 0 replies; 33+ messages in thread
From: Chris Wilson @ 2019-02-06 17:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: mika.kuoppala

Apply backpressure to hogs that emit requests faster than the GPU can
process them by waiting for their ring to be less than half-full before
proceeding with taking the struct_mutex.

This is a gross hack to apply throttling backpressure, the long term
goal is to remove the struct_mutex contention so that each client
naturally waits, preferably in an asynchronous, nonblocking fashion
(pipelined operations for the win), for their own resources and never
blocks another client within the driver at least. (Realtime priority
goals would extend to ensuring that resource contention favours high
priority clients as well.)

This patch only limits excessive request production and does not attempt
to throttle clients that block waiting for eviction (either global GTT or
system memory) or any other global resources, see above for the long term
goal.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 63 ++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c    | 13 -----
 drivers/gpu/drm/i915/intel_ringbuffer.h    | 12 +++++
 3 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 8eedf7cac493..84ef3abc567e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -753,6 +753,64 @@ static int eb_select_context(struct i915_execbuffer *eb)
 	return 0;
 }
 
+static struct i915_request *__eb_wait_for_ring(struct intel_ring *ring)
+{
+	struct i915_request *rq;
+
+	if (intel_ring_update_space(ring) >= PAGE_SIZE)
+		return NULL;
+
+	/*
+	 * Find a request that after waiting upon, there will be at least half
+	 * the ring available. The hysteresis allows us to compete for the
+	 * shared ring and should mean that we sleep less often prior to
+	 * claiming our resources, but not so long that the ring completely
+	 * drains before we can submit our next request.
+	 */
+	list_for_each_entry(rq, &ring->request_list, ring_link) {
+		if (__intel_ring_space(rq->postfix,
+				       ring->emit, ring->size) > ring->size / 2)
+			break;
+	}
+	if (&rq->ring_link == &ring->request_list)
+		return NULL; /* weird, we will check again later for real */
+
+	return i915_request_get(rq);
+}
+
+static int eb_wait_for_ring(const struct i915_execbuffer *eb)
+{
+	const struct intel_context *ce;
+	struct i915_request *rq;
+	int ret = 0;
+
+	/*
+	 * Apply a light amount of backpressure to prevent excessive hogs
+	 * from blocking waiting for space whilst holding struct_mutex and
+	 * keeping all of their resources pinned.
+	 */
+
+	ce = to_intel_context(eb->ctx, eb->engine);
+	if (!ce->ring) /* first use, assume empty! */
+		return 0;
+
+	rq = __eb_wait_for_ring(ce->ring);
+	if (rq) {
+		mutex_unlock(&eb->i915->drm.struct_mutex);
+
+		if (i915_request_wait(rq,
+				      I915_WAIT_INTERRUPTIBLE,
+				      MAX_SCHEDULE_TIMEOUT) < 0)
+			ret = -EINTR;
+
+		i915_request_put(rq);
+
+		mutex_lock(&eb->i915->drm.struct_mutex);
+	}
+
+	return ret;
+}
+
 static int eb_lookup_vmas(struct i915_execbuffer *eb)
 {
 	struct radix_tree_root *handles_vma = &eb->ctx->handles_vma;
@@ -2291,6 +2349,10 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (err)
 		goto err_rpm;
 
+	err = eb_wait_for_ring(&eb); /* may temporarily drop struct_mutex */
+	if (unlikely(err))
+		goto err_unlock;
+
 	err = eb_relocate(&eb);
 	if (err) {
 		/*
@@ -2435,6 +2497,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 err_vma:
 	if (eb.exec)
 		eb_release_vmas(&eb);
+err_unlock:
 	mutex_unlock(&dev->struct_mutex);
 err_rpm:
 	intel_runtime_pm_put(eb.i915, wakeref);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index b889b27f8aeb..7f841dba87b3 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -49,19 +49,6 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 		I915_GEM_HWS_INDEX_ADDR);
 }
 
-static unsigned int __intel_ring_space(unsigned int head,
-				       unsigned int tail,
-				       unsigned int size)
-{
-	/*
-	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
-	 * same cacheline, the Head Pointer must not be greater than the Tail
-	 * Pointer."
-	 */
-	GEM_BUG_ON(!is_power_of_2(size));
-	return (head - tail - CACHELINE_BYTES) & (size - 1);
-}
-
 unsigned int intel_ring_update_space(struct intel_ring *ring)
 {
 	unsigned int space;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 4d4ea6963a72..710ffb221775 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -832,6 +832,18 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 	return tail;
 }
 
+static inline unsigned int
+__intel_ring_space(unsigned int head, unsigned int tail, unsigned int size)
+{
+	/*
+	 * "If the Ring Buffer Head Pointer and the Tail Pointer are on the
+	 * same cacheline, the Head Pointer must not be greater than the Tail
+	 * Pointer."
+	 */
+	GEM_BUG_ON(!is_power_of_2(size));
+	return (head - tail - CACHELINE_BYTES) & (size - 1);
+}
+
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
 
 int intel_engine_setup_common(struct intel_engine_cs *engine);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 33+ messages in thread

end of thread, other threads:[~2019-02-08 15:32 UTC | newest]

Thread overview: 33+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-02-07  7:18 [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Chris Wilson
2019-02-07  7:18 ` [PATCH 2/8] drm/i915: Defer removing fence register tracking to rpm wakeup Chris Wilson
2019-02-07 13:22   ` Mika Kuoppala
2019-02-07 13:38     ` Chris Wilson
2019-02-07 14:09       ` Mika Kuoppala
2019-02-07 14:13         ` Chris Wilson
2019-02-07  7:18 ` [PATCH 3/8] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
2019-02-07 15:05   ` Mika Kuoppala
2019-02-07  7:18 ` [PATCH 4/8] drm/i915: Force the GPU reset upon wedging Chris Wilson
2019-02-08  9:31   ` Mika Kuoppala
2019-02-08  9:47     ` Chris Wilson
2019-02-07  7:18 ` [PATCH 5/8] drm/i915: Uninterruptibly drain the timelines on unwedging Chris Wilson
2019-02-08  9:46   ` Mika Kuoppala
2019-02-08 10:00     ` Chris Wilson
2019-02-08 15:07       ` Mika Kuoppala
2019-02-08 15:13         ` Chris Wilson
2019-02-07  7:18 ` [PATCH 6/8] drm/i915: Wait for old resets before applying debugfs/i915_wedged Chris Wilson
2019-02-08  9:56   ` Mika Kuoppala
2019-02-08 10:01     ` Chris Wilson
2019-02-07  7:18 ` [PATCH 7/8] drm/i915: Serialise resets with wedging Chris Wilson
2019-02-08 14:30   ` Mika Kuoppala
2019-02-07  7:18 ` [PATCH 8/8] drm/i915: Don't claim an unstarted request was guilty Chris Wilson
2019-02-07  7:41   ` [PATCH] " Chris Wilson
2019-02-08 14:47     ` Mika Kuoppala
2019-02-08 14:58       ` Chris Wilson
2019-02-08 15:31         ` Mika Kuoppala
2019-02-07  8:08 ` ✗ Fi.CI.SPARSE: warning for series starting with [1/8] drm/i915: Hack and slash, throttle execbuffer hogs (rev2) Patchwork
2019-02-07  8:25 ` ✓ Fi.CI.BAT: success " Patchwork
2019-02-07  9:53 ` ✓ Fi.CI.IGT: " Patchwork
2019-02-07 16:01 ` [PATCH 1/8] drm/i915: Hack and slash, throttle execbuffer hogs Joonas Lahtinen
2019-02-07 16:05   ` Chris Wilson
2019-02-07 16:21   ` Chris Wilson
  -- strict thread matches above, loose matches on Subject: below --
2019-02-06 17:11 Chris Wilson

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.