* [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
@ 2019-01-30  2:18 Chris Wilson
  2019-01-30  2:18 ` [PATCH 02/11] drm/i915/execlists: Suppress redundant preemption Chris Wilson
                   ` (14 more replies)
  0 siblings, 15 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:18 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Previously, we were able to rely on the recursive properties of
struct_mutex to allow us to serialise revoking mmaps and reacquiring the
FENCE registers against them being clobbered over a global device reset.
I then proceeded to throw out the baby with the bath water in order to
pursue a struct_mutex-less reset.

Perusing LWN for alternative strategies, I found the dilemma of how to
serialise access to a global resource on one side answered by
https://lwn.net/Articles/202847/ -- Sleepable RCU:

    int readside(void) {
        int idx;
        rcu_read_lock();
        if (nomoresrcu) {
            rcu_read_unlock();
            return -EINVAL;
        }
        idx = srcu_read_lock(&ss);
        rcu_read_unlock();
        /* SRCU read-side critical section. */
        srcu_read_unlock(&ss, idx);
        return 0;
    }

    void cleanup(void)
    {
        nomoresrcu = 1;
        synchronize_rcu();
        synchronize_srcu(&ss);
        cleanup_srcu_struct(&ss);
    }

No more worrying about stop_machine, just an uber-complex mutex,
optimised for reads, with the overhead pushed to the rare reset path.
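
In the patch below, i915_reset_lock()/i915_reset_unlock() form the read
side, while the reset path plays the role of cleanup(), raising
I915_RESET_BACKOFF and draining readers with synchronize_srcu(). A
minimal sketch of a reader, modelled on the i915_gem_fault() changes in
the diff (the fault_with_fence() helper and its simplified error
handling are illustrative only):

    static vm_fault_t fault_with_fence(struct drm_i915_private *i915,
                                       struct i915_vma *vma)
    {
        int srcu, err;

        srcu = i915_reset_lock(i915);   /* SRCU read-side critical section */
        if (srcu < 0)
            return VM_FAULT_SIGBUS;

        err = i915_vma_pin_fence(vma);
        if (!err) {
            /* ... remap_io_mapping() while the fence cannot be clobbered ... */
            i915_vma_unpin_fence(vma);
        }

        i915_reset_unlock(i915, srcu);  /* end of read-side section */

        return err ? VM_FAULT_SIGBUS : VM_FAULT_NOPAGE;
    }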

However, we do run the risk of a deadlock as we allocate underneath the
SRCU read lock, and the allocation may require a GPU reset, causing a
dependency cycle via the in-flight requests. We resolve that by declaring
the driver wedged and cancelling all in-flight rendering.
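
On the write side, i915_handle_error() then mirrors cleanup() from the
LWN example. A rough sketch (the placement of the I915_RESET_BACKOFF
handling and the reset_queue wakeup around the hunks below is assumed,
as those lines sit outside the visible diff context):

    static void global_reset_sketch(struct drm_i915_private *i915)
    {
        /* nomoresrcu = 1: new readers must now wait on reset_queue */
        set_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);

        synchronize_rcu();                       /* flush i915_reset_lock() */
        synchronize_srcu(&i915->gpu_error.srcu); /* drain fence users */

        /* ... revoke_mmaps(), intel_gpu_reset(), restore fences ... */

        clear_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags);
        wake_up_all(&i915->gpu_error.reset_queue);
    }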

Testcase: igt/gem_mmap_gtt/hang
Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c           | 12 +--
 drivers/gpu/drm/i915/i915_drv.h               | 18 ++--
 drivers/gpu/drm/i915/i915_gem.c               | 56 +++--------
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     | 26 -----
 drivers/gpu/drm/i915/i915_gpu_error.h         | 12 +--
 drivers/gpu/drm/i915/i915_reset.c             | 96 ++++++++++++-------
 drivers/gpu/drm/i915/i915_reset.h             |  4 +
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  1 +
 8 files changed, 93 insertions(+), 132 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index fa2c226fc779..2cea263b4d79 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1281,14 +1281,11 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	intel_wakeref_t wakeref;
 	enum intel_engine_id id;
 
+	seq_printf(m, "Reset flags: %lx\n", dev_priv->gpu_error.flags);
 	if (test_bit(I915_WEDGED, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Wedged\n");
+		seq_puts(m, "\tWedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
-		seq_puts(m, "Waiter holding struct mutex\n");
-	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
-		seq_puts(m, "struct_mutex blocked for reset\n");
+		seq_puts(m, "\tDevice (global) reset in progress\n");
 
 	if (!i915_modparams.enable_hangcheck) {
 		seq_puts(m, "Hangcheck disabled\n");
@@ -3885,9 +3882,6 @@ i915_wedged_set(void *data, u64 val)
 	 * while it is writing to 'i915_wedged'
 	 */
 
-	if (i915_reset_backoff(&i915->gpu_error))
-		return -EAGAIN;
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d072f3369ee1..8ec28a7f5452 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2986,7 +2986,12 @@ i915_gem_obj_finish_shmem_access(struct drm_i915_gem_object *obj)
 	i915_gem_object_unpin_pages(obj);
 }
 
-int __must_check i915_mutex_lock_interruptible(struct drm_device *dev);
+static inline int __must_check
+i915_mutex_lock_interruptible(struct drm_device *dev)
+{
+	return mutex_lock_interruptible(&dev->struct_mutex);
+}
+
 int i915_gem_dumb_create(struct drm_file *file_priv,
 			 struct drm_device *dev,
 			 struct drm_mode_create_dumb *args);
@@ -3003,21 +3008,11 @@ int __must_check i915_gem_set_global_seqno(struct drm_device *dev, u32 seqno);
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine);
 
-static inline bool i915_reset_backoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
 }
 
-static inline bool i915_reset_backoff_or_wedged(struct i915_gpu_error *error)
-{
-	return i915_reset_backoff(error) | i915_terminally_wedged(error);
-}
-
 static inline u32 i915_reset_count(struct i915_gpu_error *error)
 {
 	return READ_ONCE(error->reset_count);
@@ -3090,7 +3085,6 @@ struct drm_i915_fence_reg *
 i915_reserve_fence(struct drm_i915_private *dev_priv);
 void i915_unreserve_fence(struct drm_i915_fence_reg *fence);
 
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv);
 void i915_gem_restore_fences(struct drm_i915_private *dev_priv);
 
 void i915_gem_detect_bit_6_swizzle(struct drm_i915_private *dev_priv);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e802af64d628..caccff87a2a1 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -100,47 +100,6 @@ static void i915_gem_info_remove_obj(struct drm_i915_private *dev_priv,
 	spin_unlock(&dev_priv->mm.object_stat_lock);
 }
 
-static int
-i915_gem_wait_for_error(struct i915_gpu_error *error)
-{
-	int ret;
-
-	might_sleep();
-
-	/*
-	 * Only wait 10 seconds for the gpu reset to complete to avoid hanging
-	 * userspace. If it takes that long something really bad is going on and
-	 * we should simply try to bail out and fail as gracefully as possible.
-	 */
-	ret = wait_event_interruptible_timeout(error->reset_queue,
-					       !i915_reset_backoff(error),
-					       I915_RESET_TIMEOUT);
-	if (ret == 0) {
-		DRM_ERROR("Timed out waiting for the gpu reset to complete\n");
-		return -EIO;
-	} else if (ret < 0) {
-		return ret;
-	} else {
-		return 0;
-	}
-}
-
-int i915_mutex_lock_interruptible(struct drm_device *dev)
-{
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	int ret;
-
-	ret = i915_gem_wait_for_error(&dev_priv->gpu_error);
-	if (ret)
-		return ret;
-
-	ret = mutex_lock_interruptible(&dev->struct_mutex);
-	if (ret)
-		return ret;
-
-	return 0;
-}
-
 static u32 __i915_gem_park(struct drm_i915_private *i915)
 {
 	intel_wakeref_t wakeref;
@@ -1869,6 +1828,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	intel_wakeref_t wakeref;
 	struct i915_vma *vma;
 	pgoff_t page_offset;
+	int srcu;
 	int ret;
 
 	/* Sanity check that we allow writing into this object */
@@ -1908,7 +1868,6 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 		goto err_unlock;
 	}
 
-
 	/* Now pin it into the GTT as needed */
 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0,
 				       PIN_MAPPABLE |
@@ -1946,9 +1905,15 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 	if (ret)
 		goto err_unpin;
 
+	srcu = i915_reset_lock(dev_priv);
+	if (srcu < 0) {
+		ret = srcu;
+		goto err_unpin;
+	}
+
 	ret = i915_vma_pin_fence(vma);
 	if (ret)
-		goto err_unpin;
+		goto err_reset;
 
 	/* Finally, remap it using the new GTT offset */
 	ret = remap_io_mapping(area,
@@ -1969,6 +1934,8 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
 
 err_fence:
 	i915_vma_unpin_fence(vma);
+err_reset:
+	i915_reset_unlock(dev_priv, srcu);
 err_unpin:
 	__i915_vma_unpin(vma);
 err_unlock:
@@ -5324,6 +5291,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	init_waitqueue_head(&dev_priv->gpu_error.wait_queue);
 	init_waitqueue_head(&dev_priv->gpu_error.reset_queue);
 	mutex_init(&dev_priv->gpu_error.wedge_mutex);
+	init_srcu_struct(&dev_priv->gpu_error.srcu);
 
 	atomic_set(&dev_priv->mm.bsd_engine_dispatch_index, 0);
 
@@ -5356,6 +5324,8 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
 
+	cleanup_srcu_struct(&dev_priv->gpu_error.srcu);
+
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index 46e259661294..bdb745d5747f 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -435,32 +435,6 @@ void i915_unreserve_fence(struct drm_i915_fence_reg *fence)
 	list_add(&fence->link, &fence->i915->mm.fence_list);
 }
 
-/**
- * i915_gem_revoke_fences - revoke fence state
- * @dev_priv: i915 device private
- *
- * Removes all GTT mmappings via the fence registers. This forces any user
- * of the fence to reacquire that fence before continuing with their access.
- * One use is during GPU reset where the fence register is lost and we need to
- * revoke concurrent userspace access via GTT mmaps until the hardware has been
- * reset and the fence registers have been restored.
- */
-void i915_gem_revoke_fences(struct drm_i915_private *dev_priv)
-{
-	int i;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for (i = 0; i < dev_priv->num_fence_regs; i++) {
-		struct drm_i915_fence_reg *fence = &dev_priv->fence_regs[i];
-
-		GEM_BUG_ON(fence->vma && fence->vma->fence != fence);
-
-		if (fence->vma)
-			i915_vma_revoke_mmap(fence->vma);
-	}
-}
-
 /**
  * i915_gem_restore_fences - restore fence state
  * @dev_priv: i915 device private
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 53b1f22dd365..4e797c552b96 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -231,12 +231,10 @@ struct i915_gpu_error {
 	/**
 	 * flags: Control various stages of the GPU reset
 	 *
-	 * #I915_RESET_BACKOFF - When we start a reset, we want to stop any
-	 * other users acquiring the struct_mutex. To do this we set the
-	 * #I915_RESET_BACKOFF bit in the error flags when we detect a reset
-	 * and then check for that bit before acquiring the struct_mutex (in
-	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
-	 * secondary role in preventing two concurrent global reset attempts.
+	 * #I915_RESET_BACKOFF - When we start a global reset, we need to
+	 * serialise with any other users attempting to do the same, and
+	 * any global resources that may be clobbered by the reset (such as
+	 * FENCE registers).
 	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
@@ -272,6 +270,8 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
+	struct srcu_struct srcu;
+
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 4462007a681c..328b35410672 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -639,6 +639,31 @@ static void reset_prepare_engine(struct intel_engine_cs *engine)
 	engine->reset.prepare(engine);
 }
 
+static void revoke_mmaps(struct drm_i915_private *i915)
+{
+	int i;
+
+	for (i = 0; i < i915->num_fence_regs; i++) {
+		struct i915_vma *vma = i915->fence_regs[i].vma;
+		struct drm_vma_offset_node *node;
+		u64 vma_offset;
+
+		if (!vma)
+			continue;
+
+		GEM_BUG_ON(vma->fence != &i915->fence_regs[i]);
+		if (!i915_vma_has_userfault(vma))
+			continue;
+
+		node = &vma->obj->base.vma_node;
+		vma_offset = vma->ggtt_view.partial.offset << PAGE_SHIFT;
+		unmap_mapping_range(i915->drm.anon_inode->i_mapping,
+				    drm_vma_node_offset_addr(node) + vma_offset,
+				    vma->size,
+				    1);
+	}
+}
+
 static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -648,6 +673,7 @@ static void reset_prepare(struct drm_i915_private *i915)
 		reset_prepare_engine(engine);
 
 	intel_uc_sanitize(i915);
+	revoke_mmaps(i915);
 }
 
 static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
@@ -911,50 +937,19 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
-struct __i915_reset {
-	struct drm_i915_private *i915;
-	unsigned int stalled_mask;
-};
-
-static int __i915_reset__BKL(void *data)
-{
-	struct __i915_reset *arg = data;
-	int err;
-
-	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
-	if (err)
-		return err;
-
-	return gt_reset(arg->i915, arg->stalled_mask);
-}
-
-#if RESET_UNDER_STOP_MACHINE
-/*
- * XXX An alternative to using stop_machine would be to park only the
- * processes that have a GGTT mmap. By remote parking the threads (SIGSTOP)
- * we should be able to prevent their memmory accesses via the lost fence
- * registers over the course of the reset without the potential recursive
- * of mutexes between the pagefault handler and reset.
- *
- * See igt/gem_mmap_gtt/hang
- */
-#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
-#else
-#define __do_reset(fn, arg) fn(arg)
-#endif
-
 static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
-	struct __i915_reset arg = { i915, stalled_mask };
 	int err, i;
 
-	err = __do_reset(__i915_reset__BKL, &arg);
+	err = intel_gpu_reset(i915, ALL_ENGINES);
 	for (i = 0; err && i < RESET_MAX_RETRIES; i++) {
 		msleep(100);
-		err = __do_reset(__i915_reset__BKL, &arg);
+		err = intel_gpu_reset(i915, ALL_ENGINES);
 	}
+	if (err)
+		return err;
 
-	return err;
+	return gt_reset(i915, stalled_mask);
 }
 
 /**
@@ -1277,6 +1272,9 @@ void i915_handle_error(struct drm_i915_private *i915,
 		goto out;
 	}
 
+	synchronize_rcu();
+	synchronize_srcu(&i915->gpu_error.srcu);
+
 	/* Prevent any other reset-engine attempt. */
 	for_each_engine(engine, i915, tmp) {
 		while (test_and_set_bit(I915_RESET_ENGINE + engine->id,
@@ -1300,6 +1298,32 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+int i915_reset_lock(struct drm_i915_private *i915)
+{
+	int srcu;
+
+	rcu_read_lock();
+	while (test_bit(I915_RESET_BACKOFF, &i915->gpu_error.flags)) {
+		rcu_read_unlock();
+
+		if (wait_event_interruptible(i915->gpu_error.reset_queue,
+					     !test_bit(I915_RESET_BACKOFF,
+						       &i915->gpu_error.flags)))
+			return -EINTR;
+
+		rcu_read_lock();
+	}
+	srcu = srcu_read_lock(&i915->gpu_error.srcu);
+	rcu_read_unlock();
+
+	return srcu;
+}
+
+void i915_reset_unlock(struct drm_i915_private *i915, int tag)
+{
+	srcu_read_unlock(&i915->gpu_error.srcu, tag);
+}
+
 bool i915_reset_flush(struct drm_i915_private *i915)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index f2d347f319df..eb412e0158da 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -9,6 +9,7 @@
 
 #include <linux/compiler.h>
 #include <linux/types.h>
+#include <linux/srcu.h>
 
 struct drm_i915_private;
 struct intel_engine_cs;
@@ -32,6 +33,9 @@ int i915_reset_engine(struct intel_engine_cs *engine,
 void i915_reset_request(struct i915_request *rq, bool guilty);
 bool i915_reset_flush(struct drm_i915_private *i915);
 
+int i915_reset_lock(struct drm_i915_private *i915);
+void i915_reset_unlock(struct drm_i915_private *i915, int tag);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 14ae46fda49f..074a0d9cbf26 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -189,6 +189,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	init_waitqueue_head(&i915->gpu_error.wait_queue);
 	init_waitqueue_head(&i915->gpu_error.reset_queue);
+	init_srcu_struct(&i915->gpu_error.srcu);
 	mutex_init(&i915->gpu_error.wedge_mutex);
 
 	i915->wq = alloc_ordered_workqueue("mock", 0);
-- 
2.20.1


* [PATCH 02/11] drm/i915/execlists: Suppress redundant preemption
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
@ 2019-01-30  2:18 ` Chris Wilson
  2019-01-30  2:18 ` [PATCH 03/11] drm/i915/selftests: Exercise some AB...BA preemption chains Chris Wilson
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:18 UTC (permalink / raw)
  To: intel-gfx

On unwinding the active request we give it a small (limited to internal
priority levels) boost to prevent it from being gazumped a second time.
However, this means that it can be promoted to above the request that
triggered the preemption request, causing a preempt-to-idle cycle for no
change. We can avoid this if we take the boost into account when
checking if the preemption request is valid.
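
A condensed sketch of the check being added; the names follow the diff
below, while the standalone wrapper and the assumption that
__execlists_need_preempt() reduces to a strict greater-than comparison
are illustrative:

    /*
     * Fold the unwind boost (ACTIVE_PRIORITY) into the running request's
     * priority before comparing against the queue, so a queued request
     * that merely matches the boosted value does not trigger a
     * preempt-to-idle cycle for no change in execution order.
     */
    static bool preemption_is_useful(int queue_priority_hint,
                                     const struct i915_request *active)
    {
        int last_prio = rq_prio(active);

        if ((last_prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
            i915_request_started(active)) {
            /*
             * After preemption we would be requeued at the end of the
             * boosted level, i.e. effectively just below it.
             */
            last_prio |= ACTIVE_PRIORITY;
            last_prio--;
        }

        return queue_priority_hint > last_prio;
    }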

v2: After preemption the active request will be after the preemptee if
they end up with equal priority.

v3: Tvrtko pointed out that this, the existing logic, makes
I915_PRIORITY_WAIT non-preemptible. Document this interesting quirk!

v4: Prove Tvrtko was right about WAIT being non-preemptible and test it.

v5: Except not all priorities were made equal, and WAIT only stays
non-preempting if we start off as !NEWCLIENT.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/intel_lrc.c             |  45 +++++-
 drivers/gpu/drm/i915/selftests/igt_spinner.c |   9 +-
 drivers/gpu/drm/i915/selftests/intel_lrc.c   | 159 +++++++++++++++++++
 3 files changed, 208 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index a9eb0211ce77..2616b0b3e8d5 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -164,6 +164,8 @@
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
+#define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT)
+
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine,
 					    struct intel_context *ce);
@@ -188,6 +190,34 @@ static inline int rq_prio(const struct i915_request *rq)
 	return rq->sched.attr.priority;
 }
 
+static inline int active_prio(const struct i915_request *rq)
+{
+	int prio = rq_prio(rq);
+
+	/*
+	 * On unwinding the active request, we give it a priority bump
+	 * equivalent to a freshly submitted request. This protects it from
+	 * being gazumped again, but it would be preferable if we didn't
+	 * let it be gazumped in the first place!
+	 *
+	 * See __unwind_incomplete_requests()
+	 */
+	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
+	    i915_request_started(rq)) {
+		/*
+		 * After preemption, we insert the active request at the
+		 * end of the new priority level. This means that we will be
+		 * _lower_ priority than the preemptee all things equal (and
+		 * so the preemption is valid), so adjust our comparison
+		 * accordingly.
+		 */
+		prio |= ACTIVE_PRIORITY;
+		prio--;
+	}
+
+	return prio;
+}
+
 static int queue_prio(const struct intel_engine_execlists *execlists)
 {
 	struct i915_priolist *p;
@@ -208,7 +238,7 @@ static int queue_prio(const struct intel_engine_execlists *execlists)
 static inline bool need_preempt(const struct intel_engine_cs *engine,
 				const struct i915_request *rq)
 {
-	const int last_prio = rq_prio(rq);
+	int last_prio;
 
 	if (!intel_engine_has_preemption(engine))
 		return false;
@@ -228,6 +258,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
 	 * preempt. If that hint is stale or we may be trying to preempt
 	 * ourselves, ignore the request.
 	 */
+	last_prio = active_prio(rq);
 	if (!__execlists_need_preempt(engine->execlists.queue_priority_hint,
 				      last_prio))
 		return false;
@@ -353,7 +384,7 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn, *active = NULL;
 	struct list_head *uninitialized_var(pl);
-	int prio = I915_PRIORITY_INVALID | I915_PRIORITY_NEWCLIENT;
+	int prio = I915_PRIORITY_INVALID | ACTIVE_PRIORITY;
 
 	lockdep_assert_held(&engine->timeline.lock);
 
@@ -384,9 +415,15 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	 * The active request is now effectively the start of a new client
 	 * stream, so give it the equivalent small priority bump to prevent
 	 * it being gazumped a second time by another peer.
+	 *
+	 * One consequence of this preemption boost is that we may jump
+	 * over lesser priorities (such as I915_PRIORITY_WAIT), effectively
+	 * making those priorities non-preemptible. They will be moved forward
+	 * in the priority queue, but they will not gain immediate access to
+	 * the GPU.
 	 */
-	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
-		prio |= I915_PRIORITY_NEWCLIENT;
+	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
+		prio |= ACTIVE_PRIORITY;
 		active->sched.attr.priority = prio;
 		list_move_tail(&active->sched.link,
 			       i915_sched_lookup_priolist(engine, prio));
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 9ebd9225684e..86354e51bdd3 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -142,10 +142,17 @@ igt_spinner_create_request(struct igt_spinner *spin,
 	*batch++ = upper_32_bits(vma->node.start);
 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
 
-	i915_gem_chipset_flush(spin->i915);
+	if (engine->emit_init_breadcrumb &&
+	    rq->timeline->has_initial_breadcrumb) {
+		err = engine->emit_init_breadcrumb(rq);
+		if (err)
+			goto cancel_rq;
+	}
 
 	err = engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, 0);
 
+	i915_gem_chipset_flush(spin->i915);
+
 cancel_rq:
 	if (err) {
 		i915_request_skip(rq, err);
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index fb35f53c9ce3..8774a3ca5a97 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -405,6 +405,164 @@ static int live_suppress_self_preempt(void *arg)
 	goto err_client_b;
 }
 
+static int __i915_sw_fence_call
+dummy_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	return NOTIFY_DONE;
+}
+
+static struct i915_request *dummy_request(struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+
+	rq = kmalloc(sizeof(*rq), GFP_KERNEL | __GFP_ZERO);
+	if (!rq)
+		return NULL;
+
+	INIT_LIST_HEAD(&rq->active_list);
+	rq->engine = engine;
+
+	i915_sched_node_init(&rq->sched);
+
+	/* mark this request as permanently incomplete */
+	rq->fence.seqno = 1;
+	rq->hwsp_seqno = (u32 *)&rq->fence.seqno + 1;
+
+	i915_sw_fence_init(&rq->submit, dummy_notify);
+	i915_sw_fence_commit(&rq->submit);
+
+	return rq;
+}
+
+static void dummy_request_free(struct i915_request *dummy)
+{
+	i915_request_mark_complete(dummy);
+	i915_sched_node_fini(dummy->engine->i915, &dummy->sched);
+	kfree(dummy);
+}
+
+static int live_suppress_wait_preempt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct preempt_client client[4];
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	int err = -ENOMEM;
+	int i;
+
+	/*
+	 * Waiters are given a little priority nudge, but not enough
+	 * to actually cause any preemption. Double check that we do
+	 * not needlessly generate preempt-to-idle cycles.
+	 */
+
+	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
+		return 0;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0; /* presume black box */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	if (preempt_client_init(i915, &client[0])) /* ELSP[0] */
+		goto err_unlock;
+	if (preempt_client_init(i915, &client[1])) /* ELSP[1] */
+		goto err_client_0;
+	if (preempt_client_init(i915, &client[2])) /* head of queue */
+		goto err_client_1;
+	if (preempt_client_init(i915, &client[3])) /* bystander */
+		goto err_client_2;
+
+	for_each_engine(engine, i915, id) {
+		int depth;
+
+		if (!engine->emit_init_breadcrumb)
+			continue;
+
+		for (depth = 0; depth < ARRAY_SIZE(client); depth++) {
+			struct i915_request *rq[ARRAY_SIZE(client)];
+			struct i915_request *dummy;
+
+			engine->execlists.preempt_hang.count = 0;
+
+			dummy = dummy_request(engine);
+			if (!dummy)
+				goto err_client_3;
+
+			for (i = 0; i < ARRAY_SIZE(client); i++) {
+				rq[i] = igt_spinner_create_request(&client[i].spin,
+								   client[i].ctx, engine,
+								   MI_NOOP);
+				if (IS_ERR(rq[i])) {
+					err = PTR_ERR(rq[i]);
+					goto err_wedged;
+				}
+
+				/* Disable NEWCLIENT promotion */
+				i915_gem_active_set(&rq[i]->timeline->last_request,
+						    dummy);
+				i915_request_add(rq[i]);
+			}
+
+			dummy_request_free(dummy);
+
+			GEM_BUG_ON(i915_request_completed(rq[0]));
+			if (!igt_wait_for_spinner(&client[0].spin, rq[0])) {
+				pr_err("First client failed to start\n");
+				goto err_wedged;
+			}
+			GEM_BUG_ON(!i915_request_started(rq[0]));
+
+			if (i915_request_wait(rq[depth],
+					      I915_WAIT_LOCKED |
+					      I915_WAIT_PRIORITY,
+					      1) != -ETIME) {
+				pr_err("Waiter depth:%d completed!\n", depth);
+				goto err_wedged;
+			}
+
+			for (i = 0; i < ARRAY_SIZE(client); i++)
+				igt_spinner_end(&client[i].spin);
+
+			if (igt_flush_test(i915, I915_WAIT_LOCKED))
+				goto err_wedged;
+
+			if (engine->execlists.preempt_hang.count) {
+				pr_err("Preemption recorded x%d, depth %d; should have been suppressed!\n",
+				       engine->execlists.preempt_hang.count,
+				       depth);
+				err = -EINVAL;
+				goto err_client_3;
+			}
+		}
+	}
+
+	err = 0;
+err_client_3:
+	preempt_client_fini(&client[3]);
+err_client_2:
+	preempt_client_fini(&client[2]);
+err_client_1:
+	preempt_client_fini(&client[1]);
+err_client_0:
+	preempt_client_fini(&client[0]);
+err_unlock:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+
+err_wedged:
+	for (i = 0; i < ARRAY_SIZE(client); i++)
+		igt_spinner_end(&client[i].spin);
+	i915_gem_set_wedged(i915);
+	err = -EIO;
+	goto err_client_3;
+}
+
 static int live_preempt_hang(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
@@ -785,6 +943,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_preempt),
 		SUBTEST(live_late_preempt),
 		SUBTEST(live_suppress_self_preempt),
+		SUBTEST(live_suppress_wait_preempt),
 		SUBTEST(live_preempt_hang),
 		SUBTEST(live_preempt_smoke),
 	};
-- 
2.20.1


* [PATCH 03/11] drm/i915/selftests: Exercise some AB...BA preemption chains
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
  2019-01-30  2:18 ` [PATCH 02/11] drm/i915/execlists: Suppress redundant preemption Chris Wilson
@ 2019-01-30  2:18 ` Chris Wilson
  2019-01-30  2:18 ` [PATCH 04/11] drm/i915: Generalise GPU activity tracking Chris Wilson
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:18 UTC (permalink / raw)
  To: intel-gfx

Build a chain using 2 contexts (A, B) then request a preemption such
that a later A request runs before the spinner in B.
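
In outline, each iteration of the selftest submits the following per
engine (a sketch of the ordering only, not additional code):

    /*
     * hi: spinner              -- occupies the engine (context A)
     * lo: spinner              -- victim that must be preempted (context B)
     * lo: N filler requests    -- the AB...B body, N a small prime
     * hi: final request        -- boosted to I915_PRIORITY_MAX via
     *                             engine->schedule()
     *
     * hi's spinner is then ended, and the boosted hi request must
     * complete within the timeout, ahead of lo's still-spinning chain.
     */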

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/selftests/intel_lrc.c | 103 +++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 8774a3ca5a97..1151c54d2acf 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -4,6 +4,8 @@
  * Copyright © 2018 Intel Corporation
  */
 
+#include <linux/prime_numbers.h>
+
 #include "../i915_reset.h"
 
 #include "../i915_selftest.h"
@@ -563,6 +565,106 @@ static int live_suppress_wait_preempt(void *arg)
 	goto err_client_3;
 }
 
+static int live_chain_preempt(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	struct preempt_client hi, lo;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	int err = -ENOMEM;
+
+	/*
+	 * Build a chain AB...BA between two contexts (A, B) and request
+	 * preemption of the last request. It should then complete before
+	 * the previously submitted spinner in B.
+	 */
+
+	if (!HAS_LOGICAL_RING_PREEMPTION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	if (preempt_client_init(i915, &hi))
+		goto err_unlock;
+
+	if (preempt_client_init(i915, &lo))
+		goto err_client_hi;
+
+	for_each_engine(engine, i915, id) {
+		struct i915_sched_attr attr = {
+			.priority = I915_USER_PRIORITY(I915_PRIORITY_MAX),
+		};
+		int count, i;
+
+		for_each_prime_number_from(count, 1, 32) { /* must fit ring! */
+			struct i915_request *rq;
+
+			rq = igt_spinner_create_request(&hi.spin,
+							hi.ctx, engine,
+							MI_ARB_CHECK);
+			if (IS_ERR(rq))
+				goto err_wedged;
+			i915_request_add(rq);
+			if (!igt_wait_for_spinner(&hi.spin, rq))
+				goto err_wedged;
+
+			rq = igt_spinner_create_request(&lo.spin,
+							lo.ctx, engine,
+							MI_ARB_CHECK);
+			if (IS_ERR(rq))
+				goto err_wedged;
+			i915_request_add(rq);
+
+			for (i = 0; i < count; i++) {
+				rq = i915_request_alloc(engine, lo.ctx);
+				if (IS_ERR(rq))
+					goto err_wedged;
+				i915_request_add(rq);
+			}
+
+			rq = i915_request_alloc(engine, hi.ctx);
+			if (IS_ERR(rq))
+				goto err_wedged;
+			i915_request_add(rq);
+			engine->schedule(rq, &attr);
+
+			igt_spinner_end(&hi.spin);
+			if (i915_request_wait(rq, I915_WAIT_LOCKED, HZ / 5) < 0) {
+				struct drm_printer p =
+					drm_info_printer(i915->drm.dev);
+
+				pr_err("Failed to preempt over chain of %d\n",
+				       count);
+				intel_engine_dump(engine, &p,
+						  "%s\n", engine->name);
+				goto err_wedged;
+			}
+			igt_spinner_end(&lo.spin);
+		}
+	}
+
+	err = 0;
+err_client_lo:
+	preempt_client_fini(&lo);
+err_client_hi:
+	preempt_client_fini(&hi);
+err_unlock:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+
+err_wedged:
+	igt_spinner_end(&hi.spin);
+	igt_spinner_end(&lo.spin);
+	i915_gem_set_wedged(i915);
+	err = -EIO;
+	goto err_client_lo;
+}
+
 static int live_preempt_hang(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
@@ -944,6 +1046,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_late_preempt),
 		SUBTEST(live_suppress_self_preempt),
 		SUBTEST(live_suppress_wait_preempt),
+		SUBTEST(live_chain_preempt),
 		SUBTEST(live_preempt_hang),
 		SUBTEST(live_preempt_smoke),
 	};
-- 
2.20.1


* [PATCH 04/11] drm/i915: Generalise GPU activity tracking
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
  2019-01-30  2:18 ` [PATCH 02/11] drm/i915/execlists: Suppress redundant preemption Chris Wilson
  2019-01-30  2:18 ` [PATCH 03/11] drm/i915/selftests: Exercise some AB...BA preemption chains Chris Wilson
@ 2019-01-30  2:18 ` Chris Wilson
  2019-01-30 12:02   ` Tvrtko Ursulin
  2019-01-30  2:19 ` [PATCH 05/11] drm/i915: Add timeline barrier support Chris Wilson
                   ` (11 subsequent siblings)
  14 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:18 UTC (permalink / raw)
  To: intel-gfx

We currently track GPU memory usage inside VMA, such that we never
release memory used by the GPU until after it has finished accessing it.
However, we may want to track other resources aside from VMA, or we may
want to split a VMA into multiple independent regions and track each
separately. For this purpose, generalise our request tracking (akin to
struct reservation_object) so that we can embed it into other objects.
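
For reference, a minimal usage sketch of the interface introduced below;
the my_resource structure and its helpers are purely illustrative, only
the i915_active_*() calls come from this patch:

    /* Hypothetical resource whose backing store must outlive any GPU use. */
    struct my_resource {
        struct i915_active active;
        /* ... backing storage ... */
    };

    static void my_resource_retire(struct i915_active *ref)
    {
        struct my_resource *res = container_of(ref, typeof(*res), active);

        /* The last tracked request has retired; res' backing storage
         * may now be released or reused. */
    }

    static void my_resource_init(struct drm_i915_private *i915,
                                 struct my_resource *res)
    {
        i915_active_init(i915, &res->active, my_resource_retire);
    }

    /* Record that @rq (on its own timeline) is using the resource. */
    static int my_resource_mark_active(struct my_resource *res,
                                       struct i915_request *rq)
    {
        return i915_active_ref(&res->active, rq->fence.context, rq);
    }

    /* Wait on the CPU for all tracked work, then drop the tracking nodes. */
    static int my_resource_idle(struct my_resource *res)
    {
        int err;

        err = i915_active_wait(&res->active);
        if (err)
            return err;

        i915_active_fini(&res->active);
        return 0;
    }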

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile                 |   4 +-
 drivers/gpu/drm/i915/i915_active.c            | 226 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_active.h            |  66 +++++
 drivers/gpu/drm/i915/i915_active_types.h      |  26 ++
 drivers/gpu/drm/i915/i915_gem_gtt.c           |   3 +-
 drivers/gpu/drm/i915/i915_vma.c               | 173 +++-----------
 drivers/gpu/drm/i915/i915_vma.h               |   9 +-
 drivers/gpu/drm/i915/selftests/i915_active.c  | 158 ++++++++++++
 .../drm/i915/selftests/i915_live_selftests.h  |   3 +-
 9 files changed, 514 insertions(+), 154 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/i915_active.c
 create mode 100644 drivers/gpu/drm/i915/i915_active.h
 create mode 100644 drivers/gpu/drm/i915/i915_active_types.h
 create mode 100644 drivers/gpu/drm/i915/selftests/i915_active.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 210d0e8777b6..1787e1299b1b 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -57,7 +57,9 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
 i915-$(CONFIG_PERF_EVENTS) += i915_pmu.o
 
 # GEM code
-i915-y += i915_cmd_parser.o \
+i915-y += \
+	  i915_active.o \
+	  i915_cmd_parser.o \
 	  i915_gem_batch_pool.o \
 	  i915_gem_clflush.o \
 	  i915_gem_context.o \
diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
new file mode 100644
index 000000000000..e0182e19cb8b
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -0,0 +1,226 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "i915_active.h"
+
+#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
+
+struct active_node {
+	struct i915_gem_active base;
+	struct i915_active *ref;
+	struct rb_node node;
+	u64 timeline;
+};
+
+static void
+__active_retire(struct i915_active *ref)
+{
+	GEM_BUG_ON(!ref->count);
+	if (!--ref->count)
+		ref->retire(ref);
+}
+
+static void
+node_retire(struct i915_gem_active *base, struct i915_request *rq)
+{
+	__active_retire(container_of(base, struct active_node, base)->ref);
+}
+
+static void
+last_retire(struct i915_gem_active *base, struct i915_request *rq)
+{
+	__active_retire(container_of(base, struct i915_active, last));
+}
+
+static struct i915_gem_active *
+active_instance(struct i915_active *ref, u64 idx)
+{
+	struct active_node *node;
+	struct rb_node **p, *parent;
+	struct i915_request *old;
+
+	/*
+	 * We track the most recently used timeline to skip a rbtree search
+	 * for the common case, under typical loads we never need the rbtree
+	 * at all. We can reuse the last slot if it is empty, that is
+	 * after the previous activity has been retired, or if it matches the
+	 * current timeline.
+	 *
+	 * Note that we allow the timeline to be active simultaneously in
+	 * the rbtree and the last cache. We do this to avoid having
+	 * to search and replace the rbtree element for a new timeline, with
+	 * the cost being that we must be aware that the ref may be retired
+	 * twice for the same timeline (as the older rbtree element will be
+	 * retired before the new request added to last).
+	 */
+	old = i915_gem_active_raw(&ref->last, BKL(ref));
+	if (!old || old->fence.context == idx)
+		goto out;
+
+	/* Move the currently active fence into the rbtree */
+	idx = old->fence.context;
+
+	parent = NULL;
+	p = &ref->tree.rb_node;
+	while (*p) {
+		parent = *p;
+
+		node = rb_entry(parent, struct active_node, node);
+		if (node->timeline == idx)
+			goto replace;
+
+		if (node->timeline < idx)
+			p = &parent->rb_right;
+		else
+			p = &parent->rb_left;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_KERNEL);
+
+	/* kmalloc may retire the ref->last (thanks shrinker)! */
+	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
+		kfree(node);
+		goto out;
+	}
+
+	if (unlikely(!node))
+		return ERR_PTR(-ENOMEM);
+
+	init_request_active(&node->base, node_retire);
+	node->ref = ref;
+	node->timeline = idx;
+
+	rb_link_node(&node->node, parent, p);
+	rb_insert_color(&node->node, &ref->tree);
+
+replace:
+	/*
+	 * Overwrite the previous active slot in the rbtree with last,
+	 * leaving last zeroed. If the previous slot is still active,
+	 * we must be careful as we now only expect to receive one retire
+	 * callback not two, and so must undo the active counting for the
+	 * overwritten slot.
+	 */
+	if (i915_gem_active_isset(&node->base)) {
+		/* Retire ourselves from the old rq->active_list */
+		__list_del_entry(&node->base.link);
+		ref->count--;
+		GEM_BUG_ON(!ref->count);
+	}
+	GEM_BUG_ON(list_empty(&ref->last.link));
+	list_replace_init(&ref->last.link, &node->base.link);
+	node->base.request = fetch_and_zero(&ref->last.request);
+
+out:
+	return &ref->last;
+}
+
+void i915_active_init(struct drm_i915_private *i915,
+		      struct i915_active *ref,
+		      void (*retire)(struct i915_active *ref))
+{
+	ref->i915 = i915;
+	ref->retire = retire;
+	ref->tree = RB_ROOT;
+	init_request_active(&ref->last, last_retire);
+	ref->count = 0;
+}
+
+int i915_active_ref(struct i915_active *ref,
+		    u64 timeline,
+		    struct i915_request *rq)
+{
+	struct i915_gem_active *active;
+
+	active = active_instance(ref, timeline);
+	if (IS_ERR(active))
+		return PTR_ERR(active);
+
+	if (!i915_gem_active_isset(active))
+		ref->count++;
+	i915_gem_active_set(active, rq);
+
+	return 0;
+}
+
+bool i915_active_acquire(struct i915_active *ref)
+{
+	lockdep_assert_held(BKL(ref));
+	return !ref->count++;
+}
+
+void i915_active_release(struct i915_active *ref)
+{
+	lockdep_assert_held(BKL(ref));
+	__active_retire(ref);
+}
+
+int i915_active_wait(struct i915_active *ref)
+{
+	struct active_node *it, *n;
+	int ret;
+
+	ret = i915_gem_active_retire(&ref->last, BKL(ref));
+	if (ret)
+		return ret;
+
+	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
+		ret = i915_gem_active_retire(&it->base, BKL(ref));
+		if (ret)
+			return ret;
+
+		GEM_BUG_ON(i915_gem_active_isset(&it->base));
+		kfree(it);
+	}
+	ref->tree = RB_ROOT;
+
+	return 0;
+}
+
+static int __i915_request_await_active(struct i915_request *rq,
+				       struct i915_gem_active *active)
+{
+	struct i915_request *barrier =
+		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
+
+	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
+}
+
+int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
+{
+	struct active_node *it, *n;
+	int ret;
+
+	ret = __i915_request_await_active(rq, &ref->last);
+	if (ret)
+		return ret;
+
+	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
+		ret = __i915_request_await_active(rq, &it->base);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+void i915_active_fini(struct i915_active *ref)
+{
+	struct active_node *it, *n;
+
+	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
+
+	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
+		GEM_BUG_ON(i915_gem_active_isset(&it->base));
+		kfree(it);
+	}
+	ref->tree = RB_ROOT;
+}
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftests/i915_active.c"
+#endif
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
new file mode 100644
index 000000000000..c0729a046f98
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -0,0 +1,66 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef _I915_ACTIVE_H_
+#define _I915_ACTIVE_H_
+
+#include "i915_active_types.h"
+
+#include <linux/rbtree.h>
+
+#include "i915_request.h"
+
+/*
+ * GPU activity tracking
+ *
+ * Each set of commands submitted to the GPU comprises a single request that
+ * signals a fence upon completion. struct i915_request combines the
+ * command submission, scheduling and fence signaling roles. If we want to see
+ * if a particular task is complete, we need to grab the fence (struct
+ * i915_request) for that task and check or wait for it to be signaled. More
+ * often though we want to track the status of a bunch of tasks, for example
+ * to wait for the GPU to finish accessing some memory across a variety of
+ * different command pipelines from different clients. We could choose to
+ * track every single request associated with the task, but knowing that
+ * each request belongs to an ordered timeline (later requests within a
+ * timeline must wait for earlier requests), we need only track the
+ * latest request in each timeline to determine the overall status of the
+ * task.
+ *
+ * struct i915_active provides this tracking across timelines. It builds a
+ * composite shared-fence, and is updated as new work is submitted to the task,
+ * forming a snapshot of the current status. It should be embedded into the
+ * different resources that need to track their associated GPU activity to
+ * provide a callback when that GPU activity has ceased, or otherwise to
+ * provide a serialisation point either for request submission or for CPU
+ * synchronisation.
+ */
+
+void i915_active_init(struct drm_i915_private *i915,
+		      struct i915_active *ref,
+		      void (*retire)(struct i915_active *ref));
+
+int i915_active_ref(struct i915_active *ref,
+		    u64 timeline,
+		    struct i915_request *rq);
+
+int i915_active_wait(struct i915_active *ref);
+
+int i915_request_await_active(struct i915_request *rq,
+			      struct i915_active *ref);
+
+bool i915_active_acquire(struct i915_active *ref);
+void i915_active_release(struct i915_active *ref);
+
+static inline bool
+i915_active_is_idle(const struct i915_active *ref)
+{
+	return !ref->count;
+}
+
+void i915_active_fini(struct i915_active *ref);
+
+#endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
new file mode 100644
index 000000000000..411e502ed8dd
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -0,0 +1,26 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef _I915_ACTIVE_TYPES_H_
+#define _I915_ACTIVE_TYPES_H_
+
+#include <linux/rbtree.h>
+
+#include "i915_request.h"
+
+struct drm_i915_private;
+
+struct i915_active {
+	struct drm_i915_private *i915;
+
+	struct rb_root tree;
+	struct i915_gem_active last;
+	unsigned int count;
+
+	void (*retire)(struct i915_active *ref);
+};
+
+#endif /* _I915_ACTIVE_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 49b00996a15e..e625659c03a2 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1917,14 +1917,13 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 	if (!vma)
 		return ERR_PTR(-ENOMEM);
 
+	i915_active_init(i915, &vma->active, NULL);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = &ggtt->vm;
 	vma->ops = &pd_vma_ops;
 	vma->private = ppgtt;
 
-	vma->active = RB_ROOT;
-
 	vma->size = size;
 	vma->fence_size = size;
 	vma->flags = I915_VMA_GGTT;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d83b8ad5f859..d4772061e642 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -63,22 +63,23 @@ static void vma_print_allocator(struct i915_vma *vma, const char *reason)
 
 #endif
 
-struct i915_vma_active {
-	struct i915_gem_active base;
-	struct i915_vma *vma;
-	struct rb_node node;
-	u64 timeline;
-};
+static void obj_bump_mru(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 
-static void
-__i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
+	spin_lock(&i915->mm.obj_lock);
+	if (obj->bind_count)
+		list_move_tail(&obj->mm.link, &i915->mm.bound_list);
+	spin_unlock(&i915->mm.obj_lock);
+
+	obj->mm.dirty = true; /* be paranoid  */
+}
+
+static void __i915_vma_retire(struct i915_active *ref)
 {
+	struct i915_vma *vma = container_of(ref, typeof(*vma), active);
 	struct drm_i915_gem_object *obj = vma->obj;
 
-	GEM_BUG_ON(!i915_vma_is_active(vma));
-	if (--vma->active_count)
-		return;
-
 	GEM_BUG_ON(!i915_gem_object_is_active(obj));
 	if (--obj->active_count)
 		return;
@@ -90,16 +91,12 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
 		reservation_object_unlock(obj->resv);
 	}
 
-	/* Bump our place on the bound list to keep it roughly in LRU order
+	/*
+	 * Bump our place on the bound list to keep it roughly in LRU order
 	 * so that we don't steal from recently used but inactive objects
 	 * (unless we are forced to ofc!)
 	 */
-	spin_lock(&rq->i915->mm.obj_lock);
-	if (obj->bind_count)
-		list_move_tail(&obj->mm.link, &rq->i915->mm.bound_list);
-	spin_unlock(&rq->i915->mm.obj_lock);
-
-	obj->mm.dirty = true; /* be paranoid  */
+	obj_bump_mru(obj);
 
 	if (i915_gem_object_has_active_reference(obj)) {
 		i915_gem_object_clear_active_reference(obj);
@@ -107,21 +104,6 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
 	}
 }
 
-static void
-i915_vma_retire(struct i915_gem_active *base, struct i915_request *rq)
-{
-	struct i915_vma_active *active =
-		container_of(base, typeof(*active), base);
-
-	__i915_vma_retire(active->vma, rq);
-}
-
-static void
-i915_vma_last_retire(struct i915_gem_active *base, struct i915_request *rq)
-{
-	__i915_vma_retire(container_of(base, struct i915_vma, last_active), rq);
-}
-
 static struct i915_vma *
 vma_create(struct drm_i915_gem_object *obj,
 	   struct i915_address_space *vm,
@@ -137,10 +119,9 @@ vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	vma->active = RB_ROOT;
-
-	init_request_active(&vma->last_active, i915_vma_last_retire);
+	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
+
 	vma->vm = vm;
 	vma->ops = &vm->vma_ops;
 	vma->obj = obj;
@@ -823,7 +804,6 @@ void i915_vma_reopen(struct i915_vma *vma)
 static void __i915_vma_destroy(struct i915_vma *vma)
 {
 	struct drm_i915_private *i915 = vma->vm->i915;
-	struct i915_vma_active *iter, *n;
 
 	GEM_BUG_ON(vma->node.allocated);
 	GEM_BUG_ON(vma->fence);
@@ -843,10 +823,7 @@ static void __i915_vma_destroy(struct i915_vma *vma)
 		spin_unlock(&obj->vma.lock);
 	}
 
-	rbtree_postorder_for_each_entry_safe(iter, n, &vma->active, node) {
-		GEM_BUG_ON(i915_gem_active_isset(&iter->base));
-		kfree(iter);
-	}
+	i915_active_fini(&vma->active);
 
 	kmem_cache_free(i915->vmas, vma);
 }
@@ -931,104 +908,15 @@ static void export_fence(struct i915_vma *vma,
 	reservation_object_unlock(resv);
 }
 
-static struct i915_gem_active *active_instance(struct i915_vma *vma, u64 idx)
-{
-	struct i915_vma_active *active;
-	struct rb_node **p, *parent;
-	struct i915_request *old;
-
-	/*
-	 * We track the most recently used timeline to skip a rbtree search
-	 * for the common case, under typical loads we never need the rbtree
-	 * at all. We can reuse the last_active slot if it is empty, that is
-	 * after the previous activity has been retired, or if the active
-	 * matches the current timeline.
-	 *
-	 * Note that we allow the timeline to be active simultaneously in
-	 * the rbtree and the last_active cache. We do this to avoid having
-	 * to search and replace the rbtree element for a new timeline, with
-	 * the cost being that we must be aware that the vma may be retired
-	 * twice for the same timeline (as the older rbtree element will be
-	 * retired before the new request added to last_active).
-	 */
-	old = i915_gem_active_raw(&vma->last_active,
-				  &vma->vm->i915->drm.struct_mutex);
-	if (!old || old->fence.context == idx)
-		goto out;
-
-	/* Move the currently active fence into the rbtree */
-	idx = old->fence.context;
-
-	parent = NULL;
-	p = &vma->active.rb_node;
-	while (*p) {
-		parent = *p;
-
-		active = rb_entry(parent, struct i915_vma_active, node);
-		if (active->timeline == idx)
-			goto replace;
-
-		if (active->timeline < idx)
-			p = &parent->rb_right;
-		else
-			p = &parent->rb_left;
-	}
-
-	active = kmalloc(sizeof(*active), GFP_KERNEL);
-
-	/* kmalloc may retire the vma->last_active request (thanks shrinker)! */
-	if (unlikely(!i915_gem_active_raw(&vma->last_active,
-					  &vma->vm->i915->drm.struct_mutex))) {
-		kfree(active);
-		goto out;
-	}
-
-	if (unlikely(!active))
-		return ERR_PTR(-ENOMEM);
-
-	init_request_active(&active->base, i915_vma_retire);
-	active->vma = vma;
-	active->timeline = idx;
-
-	rb_link_node(&active->node, parent, p);
-	rb_insert_color(&active->node, &vma->active);
-
-replace:
-	/*
-	 * Overwrite the previous active slot in the rbtree with last_active,
-	 * leaving last_active zeroed. If the previous slot is still active,
-	 * we must be careful as we now only expect to receive one retire
-	 * callback not two, and so much undo the active counting for the
-	 * overwritten slot.
-	 */
-	if (i915_gem_active_isset(&active->base)) {
-		/* Retire ourselves from the old rq->active_list */
-		__list_del_entry(&active->base.link);
-		vma->active_count--;
-		GEM_BUG_ON(!vma->active_count);
-	}
-	GEM_BUG_ON(list_empty(&vma->last_active.link));
-	list_replace_init(&vma->last_active.link, &active->base.link);
-	active->base.request = fetch_and_zero(&vma->last_active.request);
-
-out:
-	return &vma->last_active;
-}
-
 int i915_vma_move_to_active(struct i915_vma *vma,
 			    struct i915_request *rq,
 			    unsigned int flags)
 {
 	struct drm_i915_gem_object *obj = vma->obj;
-	struct i915_gem_active *active;
 
 	lockdep_assert_held(&rq->i915->drm.struct_mutex);
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
 
-	active = active_instance(vma, rq->fence.context);
-	if (IS_ERR(active))
-		return PTR_ERR(active);
-
 	/*
 	 * Add a reference if we're newly entering the active list.
 	 * The order in which we add operations to the retirement queue is
@@ -1037,9 +925,15 @@ int i915_vma_move_to_active(struct i915_vma *vma,
 	 * add the active reference first and queue for it to be dropped
 	 * *last*.
 	 */
-	if (!i915_gem_active_isset(active) && !vma->active_count++)
+	if (!vma->active.count)
 		obj->active_count++;
-	i915_gem_active_set(active, rq);
+
+	if (unlikely(i915_active_ref(&vma->active, rq->fence.context, rq))) {
+		if (!vma->active.count)
+			obj->active_count--;
+		return -ENOMEM;
+	}
+
 	GEM_BUG_ON(!i915_vma_is_active(vma));
 	GEM_BUG_ON(!obj->active_count);
 
@@ -1073,8 +967,6 @@ int i915_vma_unbind(struct i915_vma *vma)
 	 */
 	might_sleep();
 	if (i915_vma_is_active(vma)) {
-		struct i915_vma_active *active, *n;
-
 		/*
 		 * When a closed VMA is retired, it is unbound - eek.
 		 * In order to prevent it from being recursively closed,
@@ -1090,19 +982,10 @@ int i915_vma_unbind(struct i915_vma *vma)
 		 */
 		__i915_vma_pin(vma);
 
-		ret = i915_gem_active_retire(&vma->last_active,
-					     &vma->vm->i915->drm.struct_mutex);
+		ret = i915_active_wait(&vma->active);
 		if (ret)
 			goto unpin;
 
-		rbtree_postorder_for_each_entry_safe(active, n,
-						     &vma->active, node) {
-			ret = i915_gem_active_retire(&active->base,
-						     &vma->vm->i915->drm.struct_mutex);
-			if (ret)
-				goto unpin;
-		}
-
 		ret = i915_gem_active_retire(&vma->last_fence,
 					     &vma->vm->i915->drm.struct_mutex);
 unpin:
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 5793abe509a2..3c03d4569481 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -34,6 +34,7 @@
 #include "i915_gem_fence_reg.h"
 #include "i915_gem_object.h"
 
+#include "i915_active.h"
 #include "i915_request.h"
 
 enum i915_cache_level;
@@ -108,9 +109,7 @@ struct i915_vma {
 #define I915_VMA_USERFAULT	BIT(I915_VMA_USERFAULT_BIT)
 #define I915_VMA_GGTT_WRITE	BIT(15)
 
-	unsigned int active_count;
-	struct rb_root active;
-	struct i915_gem_active last_active;
+	struct i915_active active;
 	struct i915_gem_active last_fence;
 
 	/**
@@ -154,9 +153,9 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
 void i915_vma_unpin_and_release(struct i915_vma **p_vma, unsigned int flags);
 #define I915_VMA_RELEASE_MAP BIT(0)
 
-static inline bool i915_vma_is_active(struct i915_vma *vma)
+static inline bool i915_vma_is_active(const struct i915_vma *vma)
 {
-	return vma->active_count;
+	return !i915_active_is_idle(&vma->active);
 }
 
 int __must_check i915_vma_move_to_active(struct i915_vma *vma,
diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
new file mode 100644
index 000000000000..7c5c3068565b
--- /dev/null
+++ b/drivers/gpu/drm/i915/selftests/i915_active.c
@@ -0,0 +1,158 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "../i915_selftest.h"
+
+#include "igt_flush_test.h"
+#include "lib_sw_fence.h"
+
+struct live_active {
+	struct i915_active base;
+	bool retired;
+};
+
+static void __live_active_retire(struct i915_active *base)
+{
+	struct live_active *active = container_of(base, typeof(*active), base);
+
+	active->retired = true;
+}
+
+static int __live_active_setup(struct drm_i915_private *i915,
+			       struct live_active *active)
+{
+	struct intel_engine_cs *engine;
+	struct i915_sw_fence *submit;
+	enum intel_engine_id id;
+	unsigned int count = 0;
+	int err = 0;
+
+	i915_active_init(i915, &active->base, __live_active_retire);
+	active->retired = false;
+
+	if (!i915_active_acquire(&active->base)) {
+		pr_err("First i915_active_acquire should report being idle\n");
+		return -EINVAL;
+	}
+
+	submit = heap_fence_create(GFP_KERNEL);
+
+	for_each_engine(engine, i915, id) {
+		struct i915_request *rq;
+
+		rq = i915_request_alloc(engine, i915->kernel_context);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			break;
+		}
+
+		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+						       submit,
+						       GFP_KERNEL);
+		if (err < 0) {
+			pr_err("Failed to allocate submission fence!\n");
+			i915_request_add(rq);
+			break;
+		}
+
+		err = i915_active_ref(&active->base, rq->fence.context, rq);
+		if (err) {
+			pr_err("Failed to track active ref!\n");
+			i915_request_add(rq);
+			break;
+		}
+
+		i915_request_add(rq);
+		count++;
+	}
+
+	i915_active_release(&active->base);
+	if (active->retired) {
+		pr_err("i915_active retired before submission!\n");
+		err = -EINVAL;
+	}
+	if (active->base.count != count) {
+		pr_err("i915_active not tracking all requests, found %d, expected %d\n",
+		       active->base.count, count);
+		err = -EINVAL;
+	}
+
+	i915_sw_fence_commit(submit);
+	heap_fence_put(submit);
+
+	return err;
+}
+
+static int live_active_wait(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct live_active active;
+	intel_wakeref_t wakeref;
+	int err;
+
+	/* Check that we get a callback when requests retire upon waiting */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	err = __live_active_setup(i915, &active);
+
+	i915_active_wait(&active.base);
+	if (!active.retired) {
+		pr_err("i915_active not retired after waiting!\n");
+		err = -EINVAL;
+	}
+
+	i915_active_fini(&active.base);
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
+static int live_active_retire(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct live_active active;
+	intel_wakeref_t wakeref;
+	int err;
+
+	/* Check that we get a callback when requests are indirectly retired */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	err = __live_active_setup(i915, &active);
+
+	/* waits for & retires all requests */
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	if (!active.retired) {
+		pr_err("i915_active not retired after flushing!\n");
+		err = -EINVAL;
+	}
+
+	i915_active_fini(&active.base);
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
+int i915_active_live_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_active_wait),
+		SUBTEST(live_active_retire),
+	};
+
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return 0;
+
+	return i915_subtests(tests, i915);
+}
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index 76b4f87fc853..6d766925ad04 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -12,8 +12,9 @@
 selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */
 selftest(uncore, intel_uncore_live_selftests)
 selftest(workarounds, intel_workarounds_live_selftests)
-selftest(requests, i915_request_live_selftests)
 selftest(timelines, i915_timeline_live_selftests)
+selftest(requests, i915_request_live_selftests)
+selftest(active, i915_active_live_selftests)
 selftest(objects, i915_gem_object_live_selftests)
 selftest(dmabuf, i915_gem_dmabuf_live_selftests)
 selftest(coherency, i915_gem_coherency_live_selftests)
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 05/11] drm/i915: Add timeline barrier support
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (2 preceding siblings ...)
  2019-01-30  2:18 ` [PATCH 04/11] drm/i915: Generalise GPU activity tracking Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

The timeline barrier allows serialization between different timelines.

After calling i915_timeline_set_barrier with a request, all following
submissions on this timeline will be set up to depend on that request,
the barrier. Once the barrier has been completed, it is automatically
cleared and things continue as normal.

This facility will be used by the upcoming context SSEU code.
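A rough usage sketch (illustrative only, not part of the patch; the
helper name serialise_after is made up, the i915 calls are the ones
added below):

    /* Order all later submissions on @tl behind @rq (the barrier).
     * The caller holds struct_mutex, as asserted by the new helper.
     */
    static int serialise_after(struct i915_timeline *tl,
                               struct i915_request *rq)
    {
            int err;

            /* Every request allocated on tl from now on awaits rq. */
            err = i915_timeline_set_barrier(tl, rq);
            if (err)
                    return err;

            /*
             * Subsequent i915_request_alloc() calls on tl pick the
             * barrier up via add_timeline_barrier(); once rq has been
             * retired the barrier clears itself and submission
             * continues as normal.
             */
            return 0;
    }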

v2:
 * Assert barrier has been retired on timeline_fini. (Chris Wilson)
 * Fix mock_timeline.

v3:
 * Improved comment language. (Chris Wilson)

v4:
 * Maintain ordering with previous barriers set on the timeline.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Suggested-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c           | 17 ++++++++++++++
 drivers/gpu/drm/i915/i915_timeline.c          | 21 ++++++++++++++++++
 drivers/gpu/drm/i915/i915_timeline.h          | 22 +++++++++++++++++++
 .../gpu/drm/i915/selftests/mock_timeline.c    |  1 +
 4 files changed, 61 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 9ed5baf157a3..4b1869295362 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -526,6 +526,19 @@ i915_request_alloc_slow(struct intel_context *ce)
 	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
 }
 
+static int add_barrier(struct i915_request *rq, struct i915_gem_active *active)
+{
+	struct i915_request *barrier =
+		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
+
+	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
+}
+
+static int add_timeline_barrier(struct i915_request *rq)
+{
+	return add_barrier(rq, &rq->timeline->barrier);
+}
+
 /**
  * i915_request_alloc - allocate a request structure
  *
@@ -668,6 +681,10 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 */
 	rq->head = rq->ring->emit;
 
+	ret = add_timeline_barrier(rq);
+	if (ret)
+		goto err_unwind;
+
 	ret = engine->request_alloc(rq);
 	if (ret)
 		goto err_unwind;
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 5ea3af393ffe..b354843a5040 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -163,6 +163,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
 
 	spin_lock_init(&timeline->lock);
 
+	init_request_active(&timeline->barrier, NULL);
 	init_request_active(&timeline->last_request, NULL);
 	INIT_LIST_HEAD(&timeline->requests);
 
@@ -235,6 +236,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 {
 	GEM_BUG_ON(timeline->pin_count);
 	GEM_BUG_ON(!list_empty(&timeline->requests));
+	GEM_BUG_ON(i915_gem_active_isset(&timeline->barrier));
 
 	i915_syncmap_free(&timeline->sync);
 	hwsp_free(timeline);
@@ -266,6 +268,25 @@ i915_timeline_create(struct drm_i915_private *i915,
 	return timeline;
 }
 
+int i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq)
+{
+	struct i915_request *old;
+	int err;
+
+	lockdep_assert_held(&rq->i915->drm.struct_mutex);
+
+	/* Must maintain ordering wrt existing barriers */
+	old = i915_gem_active_raw(&tl->barrier, &rq->i915->drm.struct_mutex);
+	if (old) {
+		err = i915_request_await_dma_fence(rq, &old->fence);
+		if (err)
+			return err;
+	}
+
+	i915_gem_active_set(&tl->barrier, rq);
+	return 0;
+}
+
 int i915_timeline_pin(struct i915_timeline *tl)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 8caeb66d1cd5..d167e04073c5 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -74,6 +74,16 @@ struct i915_timeline {
 	 */
 	struct i915_syncmap *sync;
 
+	/**
+	 * Barrier provides the ability to serialize ordering between different
+	 * timelines.
+	 *
+	 * Users can call i915_timeline_set_barrier which will make all
+	 * subsequent submissions to this timeline be executed only after the
+	 * barrier has been completed.
+	 */
+	struct i915_gem_active barrier;
+
 	struct list_head link;
 	const char *name;
 	struct drm_i915_private *i915;
@@ -155,4 +165,16 @@ void i915_timelines_init(struct drm_i915_private *i915);
 void i915_timelines_park(struct drm_i915_private *i915);
 void i915_timelines_fini(struct drm_i915_private *i915);
 
+/**
+ * i915_timeline_set_barrier - orders submission between different timelines
+ * @timeline: timeline to set the barrier on
+ * @rq: request after which new submissions can proceed
+ *
+ * Sets the passed in request as the serialization point for all subsequent
+ * submissions on @timeline. Subsequent requests will not be submitted to GPU
+ * until the barrier has been completed.
+ */
+int i915_timeline_set_barrier(struct i915_timeline *timeline,
+			      struct i915_request *rq);
+
 #endif
diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
index cf39ccd9fc05..e5659aaa856d 100644
--- a/drivers/gpu/drm/i915/selftests/mock_timeline.c
+++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
@@ -15,6 +15,7 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context)
 
 	spin_lock_init(&timeline->lock);
 
+	init_request_active(&timeline->barrier, NULL);
 	init_request_active(&timeline->last_request, NULL);
 	INIT_LIST_HEAD(&timeline->requests);
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (3 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 05/11] drm/i915: Add timeline barrier support Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30 15:57   ` Tvrtko Ursulin
                     ` (2 more replies)
  2019-01-30  2:19 ` [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family Chris Wilson
                   ` (9 subsequent siblings)
  14 siblings, 3 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

Wrap the active tracking for GPU references in a slabcache for faster
allocations, and keep track of inflight nodes so we can reap the
stale entries upon parking (thereby trimming our memory usage).
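A hedged sketch of the lifecycle this introduces (illustrative only;
example_gt_active_lifecycle is a made-up name, the i915_gt_active_*
helpers are the ones added below):

    static int example_gt_active_lifecycle(struct drm_i915_private *i915)
    {
            struct i915_gt_active *gt = i915_gt_active(i915);
            int err;

            /* Create the slabcache: KMEM_CACHE(active_node, ...) */
            err = i915_gt_active_init(gt);
            if (err)
                    return err;

            /*
             * i915_active_init()/i915_active_ref() now take their
             * tracking nodes from gt->slab_cache rather than kmalloc().
             */

            /* On parking the GPU, reap the idle trees... */
            i915_gt_active_park(gt);

            /* ...and on unload, destroy the slabcache. */
            i915_gt_active_fini(gt);

            return 0;
    }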

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
---
 drivers/gpu/drm/i915/i915_active.c            | 55 ++++++++++++++++---
 drivers/gpu/drm/i915/i915_active.h            | 21 +++++--
 drivers/gpu/drm/i915/i915_active_types.h      | 12 +++-
 drivers/gpu/drm/i915/i915_drv.h               |  2 +
 drivers/gpu/drm/i915/i915_gem.c               | 16 +++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
 drivers/gpu/drm/i915/i915_vma.c               |  3 +-
 drivers/gpu/drm/i915/selftests/i915_active.c  |  3 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  6 ++
 9 files changed, 100 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index e0182e19cb8b..3c7abbde42ac 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -7,7 +7,9 @@
 #include "i915_drv.h"
 #include "i915_active.h"
 
-#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
+#define i915_from_gt(x) \
+	container_of(x, struct drm_i915_private, gt.active_refs)
+#define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
 
 struct active_node {
 	struct i915_gem_active base;
@@ -79,11 +81,11 @@ active_instance(struct i915_active *ref, u64 idx)
 			p = &parent->rb_left;
 	}
 
-	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
 
 	/* kmalloc may retire the ref->last (thanks shrinker)! */
 	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
-		kfree(node);
+		kmem_cache_free(ref->gt->slab_cache, node);
 		goto out;
 	}
 
@@ -94,6 +96,9 @@ active_instance(struct i915_active *ref, u64 idx)
 	node->ref = ref;
 	node->timeline = idx;
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		list_add(&ref->active_link, &ref->gt->active_refs);
+
 	rb_link_node(&node->node, parent, p);
 	rb_insert_color(&node->node, &ref->tree);
 
@@ -119,11 +124,11 @@ active_instance(struct i915_active *ref, u64 idx)
 	return &ref->last;
 }
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref))
 {
-	ref->i915 = i915;
+	ref->gt = gt;
 	ref->retire = retire;
 	ref->tree = RB_ROOT;
 	init_request_active(&ref->last, last_retire);
@@ -161,6 +166,7 @@ void i915_active_release(struct i915_active *ref)
 
 int i915_active_wait(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 	int ret;
 
@@ -168,15 +174,19 @@ int i915_active_wait(struct i915_active *ref)
 	if (ret)
 		return ret;
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return 0;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		ret = i915_gem_active_retire(&it->base, BKL(ref));
 		if (ret)
 			return ret;
 
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
+	list_del(&ref->active_link);
 
 	return 0;
 }
@@ -210,15 +220,46 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 
+	lockdep_assert_held(BKL(ref));
 	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
+	list_del(&ref->active_link);
+}
+
+int i915_gt_active_init(struct i915_gt_active *gt)
+{
+	gt->slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
+	if (!gt->slab_cache)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&gt->active_refs);
+
+	return 0;
+}
+
+void i915_gt_active_park(struct i915_gt_active *gt)
+{
+	struct i915_active *it, *n;
+
+	list_for_each_entry_safe(it, n, &gt->active_refs, active_link)
+		i915_active_fini(it);
+}
+
+void i915_gt_active_fini(struct i915_gt_active *gt)
+{
+	GEM_BUG_ON(!list_empty(&gt->active_refs));
+	kmem_cache_destroy(gt->slab_cache);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index c0729a046f98..41c4a5da84c8 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -9,10 +9,6 @@
 
 #include "i915_active_types.h"
 
-#include <linux/rbtree.h>
-
-#include "i915_request.h"
-
 /*
  * GPU activity tracking
  *
@@ -39,7 +35,7 @@
  * synchronisation.
  */
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref));
 
@@ -63,4 +59,19 @@ i915_active_is_idle(const struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref);
 
+/*
+ * Active refs memory management
+ *
+ * To be more economical with memory, we reap all the i915_active trees on
+ * parking the GPU (when we know the GPU is inactive) and allocate the nodes
+ * from a local slab cache to hopefully reduce the fragmentation as we will
+ * then be able to free all pages en masse upon idling.
+ */
+
+int i915_gt_active_init(struct i915_gt_active *gt);
+void i915_gt_active_park(struct i915_gt_active *gt);
+void i915_gt_active_fini(struct i915_gt_active *gt);
+
+#define i915_gt_active(i915) (&(i915)->gt.active_refs)
+
 #endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
index 411e502ed8dd..3d41c33ca78c 100644
--- a/drivers/gpu/drm/i915/i915_active_types.h
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -7,14 +7,17 @@
 #ifndef _I915_ACTIVE_TYPES_H_
 #define _I915_ACTIVE_TYPES_H_
 
+#include <linux/list.h>
 #include <linux/rbtree.h>
 
 #include "i915_request.h"
 
-struct drm_i915_private;
+struct i915_gt_active;
+struct kmem_cache;
 
 struct i915_active {
-	struct drm_i915_private *i915;
+	struct i915_gt_active *gt;
+	struct list_head active_link;
 
 	struct rb_root tree;
 	struct i915_gem_active last;
@@ -23,4 +26,9 @@ struct i915_active {
 	void (*retire)(struct i915_active *ref);
 };
 
+struct i915_gt_active {
+	struct list_head active_refs;
+	struct kmem_cache *slab_cache;
+};
+
 #endif /* _I915_ACTIVE_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 8ec28a7f5452..480ab3e00ba8 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1984,6 +1984,8 @@ struct drm_i915_private {
 			struct list_head hwsp_free_list;
 		} timelines;
 
+		struct i915_gt_active active_refs;
+
 		struct list_head active_rings;
 		struct list_head closed_vma;
 		u32 active_requests;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index caccff87a2a1..2bc735df408b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -130,6 +130,7 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
 
 	intel_engines_park(i915);
 	i915_timelines_park(i915);
+	i915_gt_active_park(i915_gt_active(i915));
 
 	i915_pmu_gt_parked(i915);
 	i915_vma_parked(i915);
@@ -4998,15 +4999,19 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
 	}
 
+	ret = i915_gt_active_init(i915_gt_active(dev_priv));
+	if (ret)
+		return ret;
+
 	i915_timelines_init(dev_priv);
 
 	ret = i915_gem_init_userptr(dev_priv);
 	if (ret)
-		return ret;
+		goto err_timelines;
 
 	ret = intel_uc_init_misc(dev_priv);
 	if (ret)
-		return ret;
+		goto err_userptr;
 
 	ret = intel_wopcm_init(&dev_priv->wopcm);
 	if (ret)
@@ -5122,9 +5127,13 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 err_uc_misc:
 	intel_uc_fini_misc(dev_priv);
 
-	if (ret != -EIO) {
+err_userptr:
+	if (ret != -EIO)
 		i915_gem_cleanup_userptr(dev_priv);
+err_timelines:
+	if (ret != -EIO) {
 		i915_timelines_fini(dev_priv);
+		i915_gt_active_fini(i915_gt_active(dev_priv));
 	}
 
 	if (ret == -EIO) {
@@ -5177,6 +5186,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
 	intel_uc_fini_misc(dev_priv);
 	i915_gem_cleanup_userptr(dev_priv);
 	i915_timelines_fini(dev_priv);
+	i915_gt_active_fini(i915_gt_active(dev_priv));
 
 	i915_gem_drain_freed_objects(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e625659c03a2..d8819de0d6ee 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1917,7 +1917,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 	if (!vma)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(i915, &vma->active, NULL);
+	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = &ggtt->vm;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d4772061e642..2456bfb4877b 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -119,7 +119,8 @@ vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
+	i915_active_init(i915_gt_active(vm->i915),
+			 &vma->active, __i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = vm;
diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
index 7c5c3068565b..0e923476920e 100644
--- a/drivers/gpu/drm/i915/selftests/i915_active.c
+++ b/drivers/gpu/drm/i915/selftests/i915_active.c
@@ -30,7 +30,8 @@ static int __live_active_setup(struct drm_i915_private *i915,
 	unsigned int count = 0;
 	int err = 0;
 
-	i915_active_init(i915, &active->base, __live_active_retire);
+	i915_active_init(i915_gt_active(i915),
+			 &active->base, __live_active_retire);
 	active->retired = false;
 
 	if (!i915_active_acquire(&active->base)) {
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 074a0d9cbf26..5b88f74c1677 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -69,6 +69,7 @@ static void mock_device_release(struct drm_device *dev)
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
 
 	drain_workqueue(i915->wq);
 	i915_gem_drain_freed_objects(i915);
@@ -228,6 +229,9 @@ struct drm_i915_private *mock_gem_device(void)
 	if (!i915->priorities)
 		goto err_dependencies;
 
+	if (i915_gt_active_init(i915_gt_active(i915)))
+		goto err_priorities;
+
 	i915_timelines_init(i915);
 
 	INIT_LIST_HEAD(&i915->gt.active_rings);
@@ -257,6 +261,8 @@ struct drm_i915_private *mock_gem_device(void)
 err_unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
+err_priorities:
 	kmem_cache_destroy(i915->priorities);
 err_dependencies:
 	kmem_cache_destroy(i915->dependencies);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (4 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30 16:11   ` Tvrtko Ursulin
  2019-01-30  2:19 ` [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle Chris Wilson
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

Looking forward, we need to break the struct_mutex dependency on
i915_gem_active. In the meantime, external use of i915_gem_active is
quite beguiling; little do new users suspect that it implies a barrier,
as each request it tracks must be ordered wrt the previous one.
i915_active, which instead tracks one request from each of many
timelines, behaves more like a shared fence and fits our unordered
request submission much better. We
need to steer external users away from the singular, exclusive fence
imposed by i915_gem_active to i915_active instead. As part of that
process, we move i915_gem_active out of i915_request.c into
i915_active.c to start separating the two concepts, and rename it to
i915_active_request (both to tie it to the concept of tracking just one
request, and to give it a longer, less appealing name).
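A hedged sketch of the renamed API (illustrative only; the helper name
track_and_retire is made up, the i915_active_request_* calls are those
documented in the new i915_active.h below):

    /* Track a single, exclusive fence; the caller holds struct_mutex. */
    static int track_and_retire(struct drm_i915_private *i915,
                                struct i915_request *rq)
    {
            struct i915_active_request last;

            INIT_ACTIVE_REQUEST(&last);     /* idle, noop retire callback */

            /* Watch rq; the tracker reports busy until rq is retired. */
            __i915_active_request_set(&last, rq);

            /* Wait for completion and run the retirement callback. */
            return i915_active_request_retire(&last,
                                              &i915->drm.struct_mutex);
    }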

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_active.c            |  64 ++-
 drivers/gpu/drm/i915/i915_active.h            | 348 ++++++++++++++++
 drivers/gpu/drm/i915/i915_active_types.h      |  13 +-
 drivers/gpu/drm/i915/i915_debugfs.c           |   2 +-
 drivers/gpu/drm/i915/i915_gem.c               |  10 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |   4 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.c     |   4 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |   2 +-
 drivers/gpu/drm/i915/i915_gem_object.h        |   2 +-
 drivers/gpu/drm/i915/i915_gpu_error.c         |  10 +-
 drivers/gpu/drm/i915/i915_request.c           |  35 +-
 drivers/gpu/drm/i915/i915_request.h           | 383 ------------------
 drivers/gpu/drm/i915/i915_reset.c             |   2 +-
 drivers/gpu/drm/i915/i915_timeline.c          |  25 +-
 drivers/gpu/drm/i915/i915_timeline.h          |  14 +-
 drivers/gpu/drm/i915/i915_vma.c               |  12 +-
 drivers/gpu/drm/i915/i915_vma.h               |   2 +-
 drivers/gpu/drm/i915/intel_engine_cs.c        |   2 +-
 drivers/gpu/drm/i915/intel_overlay.c          |  33 +-
 drivers/gpu/drm/i915/selftests/intel_lrc.c    |   4 +-
 .../gpu/drm/i915/selftests/mock_timeline.c    |   4 +-
 21 files changed, 473 insertions(+), 502 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index 3c7abbde42ac..007098e44959 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -12,7 +12,7 @@
 #define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
 
 struct active_node {
-	struct i915_gem_active base;
+	struct i915_active_request base;
 	struct i915_active *ref;
 	struct rb_node node;
 	u64 timeline;
@@ -27,18 +27,18 @@ __active_retire(struct i915_active *ref)
 }
 
 static void
-node_retire(struct i915_gem_active *base, struct i915_request *rq)
+node_retire(struct i915_active_request *base, struct i915_request *rq)
 {
 	__active_retire(container_of(base, struct active_node, base)->ref);
 }
 
 static void
-last_retire(struct i915_gem_active *base, struct i915_request *rq)
+last_retire(struct i915_active_request *base, struct i915_request *rq)
 {
 	__active_retire(container_of(base, struct i915_active, last));
 }
 
-static struct i915_gem_active *
+static struct i915_active_request *
 active_instance(struct i915_active *ref, u64 idx)
 {
 	struct active_node *node;
@@ -59,7 +59,7 @@ active_instance(struct i915_active *ref, u64 idx)
 	 * twice for the same timeline (as the older rbtree element will be
 	 * retired before the new request added to last).
 	 */
-	old = i915_gem_active_raw(&ref->last, BKL(ref));
+	old = i915_active_request_raw(&ref->last, BKL(ref));
 	if (!old || old->fence.context == idx)
 		goto out;
 
@@ -84,7 +84,7 @@ active_instance(struct i915_active *ref, u64 idx)
 	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
 
 	/* kmalloc may retire the ref->last (thanks shrinker)! */
-	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
+	if (unlikely(!i915_active_request_raw(&ref->last, BKL(ref)))) {
 		kmem_cache_free(ref->gt->slab_cache, node);
 		goto out;
 	}
@@ -92,7 +92,7 @@ active_instance(struct i915_active *ref, u64 idx)
 	if (unlikely(!node))
 		return ERR_PTR(-ENOMEM);
 
-	init_request_active(&node->base, node_retire);
+	i915_active_request_init(&node->base, NULL, node_retire);
 	node->ref = ref;
 	node->timeline = idx;
 
@@ -110,7 +110,7 @@ active_instance(struct i915_active *ref, u64 idx)
 	 * callback not two, and so much undo the active counting for the
 	 * overwritten slot.
 	 */
-	if (i915_gem_active_isset(&node->base)) {
+	if (i915_active_request_isset(&node->base)) {
 		/* Retire ourselves from the old rq->active_list */
 		__list_del_entry(&node->base.link);
 		ref->count--;
@@ -131,7 +131,7 @@ void i915_active_init(struct i915_gt_active *gt,
 	ref->gt = gt;
 	ref->retire = retire;
 	ref->tree = RB_ROOT;
-	init_request_active(&ref->last, last_retire);
+	i915_active_request_init(&ref->last, NULL, last_retire);
 	ref->count = 0;
 }
 
@@ -139,15 +139,15 @@ int i915_active_ref(struct i915_active *ref,
 		    u64 timeline,
 		    struct i915_request *rq)
 {
-	struct i915_gem_active *active;
+	struct i915_active_request *active;
 
 	active = active_instance(ref, timeline);
 	if (IS_ERR(active))
 		return PTR_ERR(active);
 
-	if (!i915_gem_active_isset(active))
+	if (!i915_active_request_isset(active))
 		ref->count++;
-	i915_gem_active_set(active, rq);
+	__i915_active_request_set(active, rq);
 
 	return 0;
 }
@@ -170,7 +170,7 @@ int i915_active_wait(struct i915_active *ref)
 	struct active_node *it, *n;
 	int ret;
 
-	ret = i915_gem_active_retire(&ref->last, BKL(ref));
+	ret = i915_active_request_retire(&ref->last, BKL(ref));
 	if (ret)
 		return ret;
 
@@ -178,11 +178,11 @@ int i915_active_wait(struct i915_active *ref)
 		return 0;
 
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
-		ret = i915_gem_active_retire(&it->base, BKL(ref));
+		ret = i915_active_request_retire(&it->base, BKL(ref));
 		if (ret)
 			return ret;
 
-		GEM_BUG_ON(i915_gem_active_isset(&it->base));
+		GEM_BUG_ON(i915_active_request_isset(&it->base));
 		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
@@ -191,11 +191,11 @@ int i915_active_wait(struct i915_active *ref)
 	return 0;
 }
 
-static int __i915_request_await_active(struct i915_request *rq,
-				       struct i915_gem_active *active)
+int i915_request_await_active_request(struct i915_request *rq,
+				      struct i915_active_request *active)
 {
 	struct i915_request *barrier =
-		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
+		i915_active_request_raw(active, &rq->i915->drm.struct_mutex);
 
 	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
 }
@@ -205,12 +205,12 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
 	struct active_node *it, *n;
 	int ret;
 
-	ret = __i915_request_await_active(rq, &ref->last);
+	ret = i915_request_await_active_request(rq, &ref->last);
 	if (ret)
 		return ret;
 
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
-		ret = __i915_request_await_active(rq, &it->base);
+		ret = i915_request_await_active_request(rq, &it->base);
 		if (ret)
 			return ret;
 	}
@@ -224,13 +224,13 @@ void i915_active_fini(struct i915_active *ref)
 	struct active_node *it, *n;
 
 	lockdep_assert_held(BKL(ref));
-	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
+	GEM_BUG_ON(i915_active_request_isset(&ref->last));
 
 	if (RB_EMPTY_ROOT(&ref->tree))
 		return;
 
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
-		GEM_BUG_ON(i915_gem_active_isset(&it->base));
+		GEM_BUG_ON(i915_active_request_isset(&it->base));
 		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
@@ -262,6 +262,26 @@ void i915_gt_active_fini(struct i915_gt_active *gt)
 	kmem_cache_destroy(gt->slab_cache);
 }
 
+int i915_active_request_set(struct i915_active_request *active,
+			    struct i915_request *rq)
+{
+	int err;
+
+	/* Must maintain ordering wrt previous active requests */
+	err = i915_request_await_active_request(rq, active);
+	if (err)
+		return err;
+
+	__i915_active_request_set(active, rq);
+	return 0;
+}
+
+void i915_active_retire_noop(struct i915_active_request *active,
+			     struct i915_request *request)
+{
+	/* Space left intentionally blank */
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/i915_active.c"
 #endif
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index 41c4a5da84c8..e24421a6ac5c 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -7,8 +7,354 @@
 #ifndef _I915_ACTIVE_H_
 #define _I915_ACTIVE_H_
 
+#include <linux/lockdep.h>
+
 #include "i915_active_types.h"
 
+/*
+ * We treat requests as fences. This is not to be confused with our
+ * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync.
+ * We use the fences to synchronize access from the CPU with activity on the
+ * GPU, for example, we should not rewrite an object's PTE whilst the GPU
+ * is reading them. We also track fences at a higher level to provide
+ * implicit synchronisation around GEM objects, e.g. set-domain will wait
+ * for outstanding GPU rendering before marking the object ready for CPU
+ * access, or a pageflip will wait until the GPU is complete before showing
+ * the frame on the scanout.
+ *
+ * In order to use a fence, the object must track the fence it needs to
+ * serialise with. For example, GEM objects want to track both read and
+ * write access so that we can perform concurrent read operations between
+ * the CPU and GPU engines, as well as waiting for all rendering to
+ * complete, or waiting for the last GPU user of a "fence register". The
+ * object then embeds a #i915_active_request to track the most recent (in
+ * retirement order) request relevant for the desired mode of access.
+ * The #i915_active_request is updated with i915_active_request_set() to
+ * track the most recent fence request, typically this is done as part of
+ * i915_vma_move_to_active().
+ *
+ * When the #i915_active_request completes (is retired), it will
+ * signal its completion to the owner through a callback as well as mark
+ * itself as idle (i915_active_request.request == NULL). The owner
+ * can then perform any action, such as delayed freeing of an active
+ * resource including itself.
+ */
+
+void i915_active_retire_noop(struct i915_active_request *active,
+			     struct i915_request *request);
+
+/**
+ * i915_active_request_init - prepares the activity tracker for use
+ * @active - the active tracker
+ * @rq - initial request to track, can be NULL
+ * @retire - a callback invoked when the tracker is retired (becomes idle),
+ *           can be NULL
+ *
+ * i915_active_request_init() prepares the embedded @active struct for use as
+ * an activity tracker, that is for tracking the last known active request
+ * associated with it. When the last request becomes idle, i.e. when it is
+ * retired after completion, the optional callback @retire is invoked.
+ */
+static inline void
+i915_active_request_init(struct i915_active_request *active,
+			 struct i915_request *rq,
+			 i915_active_retire_fn retire)
+{
+	RCU_INIT_POINTER(active->request, rq);
+	INIT_LIST_HEAD(&active->link);
+	active->retire = retire ?: i915_active_retire_noop;
+}
+
+#define INIT_ACTIVE_REQUEST(name) i915_active_request_init((name), NULL, NULL)
+
+/**
+ * __i915_active_request_set - updates the tracker to watch the current request
+ * @active - the active tracker
+ * @request - the request to watch
+ *
+ * __i915_active_request_set() watches the given @request for completion. Whilst
+ * that @request is busy, the @active reports busy. When that @request is
+ * retired, the @active tracker is updated to report idle.
+ */
+static inline void
+__i915_active_request_set(struct i915_active_request *active,
+			  struct i915_request *request)
+{
+	list_move(&active->link, &request->active_list);
+	rcu_assign_pointer(active->request, request);
+}
+
+int __must_check
+i915_active_request_set(struct i915_active_request *active,
+			struct i915_request *rq);
+
+/**
+ * i915_active_request_set_retire_fn - updates the retirement callback
+ * @active - the active tracker
+ * @fn - the routine called when the request is retired
+ * @mutex - struct_mutex used to guard retirements
+ *
+ * i915_active_request_set_retire_fn() updates the function pointer that
+ * is called when the final request associated with the @active tracker
+ * is retired.
+ */
+static inline void
+i915_active_request_set_retire_fn(struct i915_active_request *active,
+				  i915_active_retire_fn fn,
+				  struct mutex *mutex)
+{
+	lockdep_assert_held(mutex);
+	active->retire = fn ?: i915_active_retire_noop;
+}
+
+static inline struct i915_request *
+__i915_active_request_peek(const struct i915_active_request *active)
+{
+	/*
+	 * Inside the error capture (running with the driver in an unknown
+	 * state), we want to bend the rules slightly (a lot).
+	 *
+	 * Work is in progress to make it safer, in the meantime this keeps
+	 * the known issue from spamming the logs.
+	 */
+	return rcu_dereference_protected(active->request, 1);
+}
+
+/**
+ * i915_active_request_raw - return the active request
+ * @active - the active tracker
+ *
+ * i915_active_request_raw() returns the current request being tracked, or NULL.
+ * It does not obtain a reference on the request for the caller, so the caller
+ * must hold struct_mutex.
+ */
+static inline struct i915_request *
+i915_active_request_raw(const struct i915_active_request *active,
+			struct mutex *mutex)
+{
+	return rcu_dereference_protected(active->request,
+					 lockdep_is_held(mutex));
+}
+
+/**
+ * i915_active_request_peek - report the active request being monitored
+ * @active - the active tracker
+ *
+ * i915_active_request_peek() returns the current request being tracked if
+ * still active, or NULL. It does not obtain a reference on the request
+ * for the caller, so the caller must hold struct_mutex.
+ */
+static inline struct i915_request *
+i915_active_request_peek(const struct i915_active_request *active,
+			 struct mutex *mutex)
+{
+	struct i915_request *request;
+
+	request = i915_active_request_raw(active, mutex);
+	if (!request || i915_request_completed(request))
+		return NULL;
+
+	return request;
+}
+
+/**
+ * i915_active_request_get - return a reference to the active request
+ * @active - the active tracker
+ *
+ * i915_active_request_get() returns a reference to the active request, or NULL
+ * if the active tracker is idle. The caller must hold struct_mutex.
+ */
+static inline struct i915_request *
+i915_active_request_get(const struct i915_active_request *active,
+			struct mutex *mutex)
+{
+	return i915_request_get(i915_active_request_peek(active, mutex));
+}
+
+/**
+ * __i915_active_request_get_rcu - return a reference to the active request
+ * @active - the active tracker
+ *
+ * __i915_active_request_get_rcu() returns a reference to the active request,
+ * or NULL if the active tracker is idle. The caller must hold the RCU read
+ * lock, but the returned pointer is safe to use outside of RCU.
+ */
+static inline struct i915_request *
+__i915_active_request_get_rcu(const struct i915_active_request *active)
+{
+	/*
+	 * Performing a lockless retrieval of the active request is super
+	 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
+	 * slab of request objects will not be freed whilst we hold the
+	 * RCU read lock. It does not guarantee that the request itself
+	 * will not be freed and then *reused*. Viz,
+	 *
+	 * Thread A			Thread B
+	 *
+	 * rq = active.request
+	 *				retire(rq) -> free(rq);
+	 *				(rq is now first on the slab freelist)
+	 *				active.request = NULL
+	 *
+	 *				rq = new submission on a new object
+	 * ref(rq)
+	 *
+	 * To prevent the request from being reused whilst the caller
+	 * uses it, we take a reference like normal. Whilst acquiring
+	 * the reference we check that it is not in a destroyed state
+	 * (refcnt == 0). That prevents the request being reallocated
+	 * whilst the caller holds on to it. To check that the request
+	 * was not reallocated as we acquired the reference we have to
+	 * check that our request remains the active request across
+	 * the lookup, in the same manner as a seqlock. The visibility
+	 * of the pointer versus the reference counting is controlled
+	 * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
+	 *
+	 * In the middle of all that, we inspect whether the request is
+	 * complete. Retiring is lazy so the request may be completed long
+	 * before the active tracker is updated. Querying whether the
+	 * request is complete is far cheaper (as it involves no locked
+	 * instructions setting cachelines to exclusive) than acquiring
+	 * the reference, so we do it first. The RCU read lock ensures the
+	 * pointer dereference is valid, but does not ensure that the
+	 * seqno nor HWS is the right one! However, if the request was
+	 * reallocated, that means the active tracker's request was complete.
+	 * If the new request is also complete, then both are and we can
+	 * just report the active tracker is idle. If the new request is
+	 * incomplete, then we acquire a reference on it and check that
+	 * it remained the active request.
+	 *
+	 * It is then imperative that we do not zero the request on
+	 * reallocation, so that we can chase the dangling pointers!
+	 * See i915_request_alloc().
+	 */
+	do {
+		struct i915_request *request;
+
+		request = rcu_dereference(active->request);
+		if (!request || i915_request_completed(request))
+			return NULL;
+
+		/*
+		 * An especially silly compiler could decide to recompute the
+		 * result of i915_request_completed, more specifically
+		 * re-emit the load for request->fence.seqno. A race would catch
+		 * a later seqno value, which could flip the result from true to
+		 * false. Which means part of the instructions below might not
+		 * be executed, while later on instructions are executed. Due to
+		 * barriers within the refcounting the inconsistency can't reach
+		 * past the call to i915_request_get_rcu, but not executing
+		 * that while still executing i915_request_put() creates
+		 * havoc enough.  Prevent this with a compiler barrier.
+		 */
+		barrier();
+
+		request = i915_request_get_rcu(request);
+
+		/*
+		 * What stops the following rcu_access_pointer() from occurring
+		 * before the above i915_request_get_rcu()? If we were
+		 * to read the value before pausing to get the reference to
+		 * the request, we may not notice a change in the active
+		 * tracker.
+		 *
+		 * The rcu_access_pointer() is a mere compiler barrier, which
+		 * means both the CPU and compiler are free to perform the
+		 * memory read without constraint. The compiler only has to
+		 * ensure that any operations after the rcu_access_pointer()
+		 * occur afterwards in program order. This means the read may
+		 * be performed earlier by an out-of-order CPU, or adventurous
+		 * compiler.
+		 *
+		 * The atomic operation at the heart of
+		 * i915_request_get_rcu(), see dma_fence_get_rcu(), is
+		 * atomic_inc_not_zero() which is only a full memory barrier
+		 * when successful. That is, if i915_request_get_rcu()
+		 * returns the request (and so with the reference counted
+		 * incremented) then the following read for rcu_access_pointer()
+		 * must occur after the atomic operation and so confirm
+		 * that this request is the one currently being tracked.
+		 *
+		 * The corresponding write barrier is part of
+		 * rcu_assign_pointer().
+		 */
+		if (!request || request == rcu_access_pointer(active->request))
+			return rcu_pointer_handoff(request);
+
+		i915_request_put(request);
+	} while (1);
+}
+
+/**
+ * i915_active_request_get_unlocked - return a reference to the active request
+ * @active - the active tracker
+ *
+ * i915_active_request_get_unlocked() returns a reference to the active request,
+ * or NULL if the active tracker is idle. The reference is obtained under RCU,
+ * so no locking is required by the caller.
+ *
+ * The reference should be freed with i915_request_put().
+ */
+static inline struct i915_request *
+i915_active_request_get_unlocked(const struct i915_active_request *active)
+{
+	struct i915_request *request;
+
+	rcu_read_lock();
+	request = __i915_active_request_get_rcu(active);
+	rcu_read_unlock();
+
+	return request;
+}
+
+/**
+ * i915_active_request_isset - report whether the active tracker is assigned
+ * @active - the active tracker
+ *
+ * i915_active_request_isset() returns true if the active tracker is currently
+ * assigned to a request. Due to the lazy retiring, that request may be idle
+ * and this may report stale information.
+ */
+static inline bool
+i915_active_request_isset(const struct i915_active_request *active)
+{
+	return rcu_access_pointer(active->request);
+}
+
+/**
+ * i915_active_request_retire - waits until the request is retired
+ * @active - the active request on which to wait
+ *
+ * i915_active_request_retire() waits until the request is completed,
+ * and then ensures that at least the retirement handler for this
+ * @active tracker is called before returning. If the @active
+ * tracker is idle, the function returns immediately.
+ */
+static inline int __must_check
+i915_active_request_retire(struct i915_active_request *active,
+			   struct mutex *mutex)
+{
+	struct i915_request *request;
+	long ret;
+
+	request = i915_active_request_raw(active, mutex);
+	if (!request)
+		return 0;
+
+	ret = i915_request_wait(request,
+				I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
+				MAX_SCHEDULE_TIMEOUT);
+	if (ret < 0)
+		return ret;
+
+	list_del_init(&active->link);
+	RCU_INIT_POINTER(active->request, NULL);
+
+	active->retire(active, request);
+
+	return 0;
+}
+
 /*
  * GPU activity tracking
  *
@@ -47,6 +393,8 @@ int i915_active_wait(struct i915_active *ref);
 
 int i915_request_await_active(struct i915_request *rq,
 			      struct i915_active *ref);
+int i915_request_await_active_request(struct i915_request *rq,
+				      struct i915_active_request *active);
 
 bool i915_active_acquire(struct i915_active *ref);
 void i915_active_release(struct i915_active *ref);
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
index 3d41c33ca78c..7c1b0b1958fa 100644
--- a/drivers/gpu/drm/i915/i915_active_types.h
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -9,18 +9,29 @@
 
 #include <linux/list.h>
 #include <linux/rbtree.h>
+#include <linux/rcupdate.h>
 
 #include "i915_request.h"
 
+struct i915_active_request;
 struct i915_gt_active;
 struct kmem_cache;
 
+typedef void (*i915_active_retire_fn)(struct i915_active_request *,
+				      struct i915_request *);
+
+struct i915_active_request {
+	struct i915_request __rcu *request;
+	struct list_head link;
+	i915_active_retire_fn retire;
+};
+
 struct i915_active {
 	struct i915_gt_active *gt;
 	struct list_head active_link;
 
 	struct rb_root tree;
-	struct i915_gem_active last;
+	struct i915_active_request last;
 	unsigned int count;
 
 	void (*retire)(struct i915_active *ref);
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2cea263b4d79..9cf86c8df958 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -207,7 +207,7 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 		if (vma->fence)
 			seq_printf(m, " , fence: %d%s",
 				   vma->fence->id,
-				   i915_gem_active_isset(&vma->last_fence) ? "*" : "");
+				   i915_active_request_isset(&vma->last_fence) ? "*" : "");
 		seq_puts(m, ")");
 	}
 	if (obj->stolen)
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2bc735df408b..ceb06cf73fc3 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2988,7 +2988,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
 
 	GEM_BUG_ON(i915->gt.active_requests);
 	for_each_engine(engine, i915, id) {
-		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
+		GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
 		GEM_BUG_ON(engine->last_retired_context !=
 			   to_intel_context(i915->kernel_context, engine));
 	}
@@ -3234,7 +3234,7 @@ wait_for_timelines(struct drm_i915_private *i915,
 	list_for_each_entry(tl, &gt->active_list, link) {
 		struct i915_request *rq;
 
-		rq = i915_gem_active_get_unlocked(&tl->last_request);
+		rq = i915_active_request_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
@@ -4135,7 +4135,8 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 }
 
 static void
-frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
+frontbuffer_retire(struct i915_active_request *active,
+		   struct i915_request *request)
 {
 	struct drm_i915_gem_object *obj =
 		container_of(active, typeof(*obj), frontbuffer_write);
@@ -4162,7 +4163,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 	obj->resv = &obj->__builtin_resv;
 
 	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
-	init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
+	i915_active_request_init(&obj->frontbuffer_write,
+				 NULL, frontbuffer_retire);
 
 	obj->mm.madv = I915_MADV_WILLNEED;
 	INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 6faf1f6faab5..ea8e818d22bf 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -653,8 +653,8 @@ last_request_on_engine(struct i915_timeline *timeline,
 
 	GEM_BUG_ON(timeline == &engine->timeline);
 
-	rq = i915_gem_active_raw(&timeline->last_request,
-				 &engine->i915->drm.struct_mutex);
+	rq = i915_active_request_raw(&timeline->last_request,
+				     &engine->i915->drm.struct_mutex);
 	if (rq && rq->engine == engine) {
 		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
 			  timeline->name, engine->name,
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
index bdb745d5747f..946a3a756787 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
@@ -223,7 +223,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 			 i915_gem_object_get_tiling(vma->obj)))
 			return -EINVAL;
 
-		ret = i915_gem_active_retire(&vma->last_fence,
+		ret = i915_active_request_retire(&vma->last_fence,
 					     &vma->obj->base.dev->struct_mutex);
 		if (ret)
 			return ret;
@@ -232,7 +232,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
 	if (fence->vma) {
 		struct i915_vma *old = fence->vma;
 
-		ret = i915_gem_active_retire(&old->last_fence,
+		ret = i915_active_request_retire(&old->last_fence,
 					     &old->obj->base.dev->struct_mutex);
 		if (ret)
 			return ret;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index d8819de0d6ee..be79c377fc59 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1918,7 +1918,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 		return ERR_PTR(-ENOMEM);
 
 	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
-	init_request_active(&vma->last_fence, NULL);
+	INIT_ACTIVE_REQUEST(&vma->last_fence);
 
 	vma->vm = &ggtt->vm;
 	vma->ops = &pd_vma_ops;
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index 73fec917d097..fab040331cdb 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -175,7 +175,7 @@ struct drm_i915_gem_object {
 
 	atomic_t frontbuffer_bits;
 	unsigned int frontbuffer_ggtt_origin; /* write once */
-	struct i915_gem_active frontbuffer_write;
+	struct i915_active_request frontbuffer_write;
 
 	/** Current tiling stride for the object, if it's tiled. */
 	unsigned int tiling_and_stride;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 6e2e5ed2bd0a..9a65341fec09 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1062,23 +1062,23 @@ i915_error_object_create(struct drm_i915_private *i915,
 }
 
 /* The error capture is special as tries to run underneath the normal
- * locking rules - so we use the raw version of the i915_gem_active lookup.
+ * locking rules - so we use the raw version of the i915_active_request lookup.
  */
 static inline u32
-__active_get_seqno(struct i915_gem_active *active)
+__active_get_seqno(struct i915_active_request *active)
 {
 	struct i915_request *request;
 
-	request = __i915_gem_active_peek(active);
+	request = __i915_active_request_peek(active);
 	return request ? request->global_seqno : 0;
 }
 
 static inline int
-__active_get_engine_id(struct i915_gem_active *active)
+__active_get_engine_id(struct i915_active_request *active)
 {
 	struct i915_request *request;
 
-	request = __i915_gem_active_peek(active);
+	request = __i915_active_request_peek(active);
 	return request ? request->engine->id : -1;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 4b1869295362..a09f47ccc703 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -29,6 +29,7 @@
 #include <linux/sched/signal.h>
 
 #include "i915_drv.h"
+#include "i915_active.h"
 #include "i915_reset.h"
 
 static const char *i915_fence_get_driver_name(struct dma_fence *fence)
@@ -125,12 +126,6 @@ static void unreserve_gt(struct drm_i915_private *i915)
 		i915_gem_park(i915);
 }
 
-void i915_gem_retire_noop(struct i915_gem_active *active,
-			  struct i915_request *request)
-{
-	/* Space left intentionally blank */
-}
-
 static void advance_ring(struct i915_request *request)
 {
 	struct intel_ring *ring = request->ring;
@@ -244,7 +239,7 @@ static void __retire_engine_upto(struct intel_engine_cs *engine,
 
 static void i915_request_retire(struct i915_request *request)
 {
-	struct i915_gem_active *active, *next;
+	struct i915_active_request *active, *next;
 
 	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
 		  request->engine->name,
@@ -278,10 +273,10 @@ static void i915_request_retire(struct i915_request *request)
 		 * we may spend an inordinate amount of time simply handling
 		 * the retirement of requests and processing their callbacks.
 		 * Of which, this loop itself is particularly hot due to the
-		 * cache misses when jumping around the list of i915_gem_active.
-		 * So we try to keep this loop as streamlined as possible and
-		 * also prefetch the next i915_gem_active to try and hide
-		 * the likely cache miss.
+		 * cache misses when jumping around the list of
+		 * i915_active_request.  So we try to keep this loop as
+		 * streamlined as possible and also prefetch the next
+		 * i915_active_request to try and hide the likely cache miss.
 		 */
 		prefetchw(next);
 
@@ -526,17 +521,9 @@ i915_request_alloc_slow(struct intel_context *ce)
 	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
 }
 
-static int add_barrier(struct i915_request *rq, struct i915_gem_active *active)
-{
-	struct i915_request *barrier =
-		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
-
-	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
-}
-
 static int add_timeline_barrier(struct i915_request *rq)
 {
-	return add_barrier(rq, &rq->timeline->barrier);
+	return i915_request_await_active_request(rq, &rq->timeline->barrier);
 }
 
 /**
@@ -595,7 +582,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 * We use RCU to look up requests in flight. The lookups may
 	 * race with the request being allocated from the slab freelist.
 	 * That is the request we are writing to here, may be in the process
-	 * of being read by __i915_gem_active_get_rcu(). As such,
+	 * of being read by __i915_active_request_get_rcu(). As such,
 	 * we have to be very careful when overwriting the contents. During
 	 * the RCU lookup, we change chase the request->engine pointer,
 	 * read the request->global_seqno and increment the reference count.
@@ -937,8 +924,8 @@ void i915_request_add(struct i915_request *request)
 	 * see a more recent value in the hws than we are tracking.
 	 */
 
-	prev = i915_gem_active_raw(&timeline->last_request,
-				   &request->i915->drm.struct_mutex);
+	prev = i915_active_request_raw(&timeline->last_request,
+				       &request->i915->drm.struct_mutex);
 	if (prev && !i915_request_completed(prev)) {
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
@@ -954,7 +941,7 @@ void i915_request_add(struct i915_request *request)
 	spin_unlock_irq(&timeline->lock);
 
 	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
-	i915_gem_active_set(&timeline->last_request, request);
+	__i915_active_request_set(&timeline->last_request, request);
 
 	list_add_tail(&request->ring_link, &ring->request_list);
 	if (list_is_first(&request->ring_link, &ring->request_list)) {
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 3cffb96203b9..40f3e8dcbdd5 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -403,387 +403,4 @@ static inline void i915_request_mark_complete(struct i915_request *rq)
 
 void i915_retire_requests(struct drm_i915_private *i915);
 
-/*
- * We treat requests as fences. This is not be to confused with our
- * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync.
- * We use the fences to synchronize access from the CPU with activity on the
- * GPU, for example, we should not rewrite an object's PTE whilst the GPU
- * is reading them. We also track fences at a higher level to provide
- * implicit synchronisation around GEM objects, e.g. set-domain will wait
- * for outstanding GPU rendering before marking the object ready for CPU
- * access, or a pageflip will wait until the GPU is complete before showing
- * the frame on the scanout.
- *
- * In order to use a fence, the object must track the fence it needs to
- * serialise with. For example, GEM objects want to track both read and
- * write access so that we can perform concurrent read operations between
- * the CPU and GPU engines, as well as waiting for all rendering to
- * complete, or waiting for the last GPU user of a "fence register". The
- * object then embeds a #i915_gem_active to track the most recent (in
- * retirement order) request relevant for the desired mode of access.
- * The #i915_gem_active is updated with i915_gem_active_set() to track the
- * most recent fence request, typically this is done as part of
- * i915_vma_move_to_active().
- *
- * When the #i915_gem_active completes (is retired), it will
- * signal its completion to the owner through a callback as well as mark
- * itself as idle (i915_gem_active.request == NULL). The owner
- * can then perform any action, such as delayed freeing of an active
- * resource including itself.
- */
-struct i915_gem_active;
-
-typedef void (*i915_gem_retire_fn)(struct i915_gem_active *,
-				   struct i915_request *);
-
-struct i915_gem_active {
-	struct i915_request __rcu *request;
-	struct list_head link;
-	i915_gem_retire_fn retire;
-};
-
-void i915_gem_retire_noop(struct i915_gem_active *,
-			  struct i915_request *request);
-
-/**
- * init_request_active - prepares the activity tracker for use
- * @active - the active tracker
- * @func - a callback when then the tracker is retired (becomes idle),
- *         can be NULL
- *
- * init_request_active() prepares the embedded @active struct for use as
- * an activity tracker, that is for tracking the last known active request
- * associated with it. When the last request becomes idle, when it is retired
- * after completion, the optional callback @func is invoked.
- */
-static inline void
-init_request_active(struct i915_gem_active *active,
-		    i915_gem_retire_fn retire)
-{
-	RCU_INIT_POINTER(active->request, NULL);
-	INIT_LIST_HEAD(&active->link);
-	active->retire = retire ?: i915_gem_retire_noop;
-}
-
-/**
- * i915_gem_active_set - updates the tracker to watch the current request
- * @active - the active tracker
- * @request - the request to watch
- *
- * i915_gem_active_set() watches the given @request for completion. Whilst
- * that @request is busy, the @active reports busy. When that @request is
- * retired, the @active tracker is updated to report idle.
- */
-static inline void
-i915_gem_active_set(struct i915_gem_active *active,
-		    struct i915_request *request)
-{
-	list_move(&active->link, &request->active_list);
-	rcu_assign_pointer(active->request, request);
-}
-
-/**
- * i915_gem_active_set_retire_fn - updates the retirement callback
- * @active - the active tracker
- * @fn - the routine called when the request is retired
- * @mutex - struct_mutex used to guard retirements
- *
- * i915_gem_active_set_retire_fn() updates the function pointer that
- * is called when the final request associated with the @active tracker
- * is retired.
- */
-static inline void
-i915_gem_active_set_retire_fn(struct i915_gem_active *active,
-			      i915_gem_retire_fn fn,
-			      struct mutex *mutex)
-{
-	lockdep_assert_held(mutex);
-	active->retire = fn ?: i915_gem_retire_noop;
-}
-
-static inline struct i915_request *
-__i915_gem_active_peek(const struct i915_gem_active *active)
-{
-	/*
-	 * Inside the error capture (running with the driver in an unknown
-	 * state), we want to bend the rules slightly (a lot).
-	 *
-	 * Work is in progress to make it safer, in the meantime this keeps
-	 * the known issue from spamming the logs.
-	 */
-	return rcu_dereference_protected(active->request, 1);
-}
-
-/**
- * i915_gem_active_raw - return the active request
- * @active - the active tracker
- *
- * i915_gem_active_raw() returns the current request being tracked, or NULL.
- * It does not obtain a reference on the request for the caller, so the caller
- * must hold struct_mutex.
- */
-static inline struct i915_request *
-i915_gem_active_raw(const struct i915_gem_active *active, struct mutex *mutex)
-{
-	return rcu_dereference_protected(active->request,
-					 lockdep_is_held(mutex));
-}
-
-/**
- * i915_gem_active_peek - report the active request being monitored
- * @active - the active tracker
- *
- * i915_gem_active_peek() returns the current request being tracked if
- * still active, or NULL. It does not obtain a reference on the request
- * for the caller, so the caller must hold struct_mutex.
- */
-static inline struct i915_request *
-i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex)
-{
-	struct i915_request *request;
-
-	request = i915_gem_active_raw(active, mutex);
-	if (!request || i915_request_completed(request))
-		return NULL;
-
-	return request;
-}
-
-/**
- * i915_gem_active_get - return a reference to the active request
- * @active - the active tracker
- *
- * i915_gem_active_get() returns a reference to the active request, or NULL
- * if the active tracker is idle. The caller must hold struct_mutex.
- */
-static inline struct i915_request *
-i915_gem_active_get(const struct i915_gem_active *active, struct mutex *mutex)
-{
-	return i915_request_get(i915_gem_active_peek(active, mutex));
-}
-
-/**
- * __i915_gem_active_get_rcu - return a reference to the active request
- * @active - the active tracker
- *
- * __i915_gem_active_get() returns a reference to the active request, or NULL
- * if the active tracker is idle. The caller must hold the RCU read lock, but
- * the returned pointer is safe to use outside of RCU.
- */
-static inline struct i915_request *
-__i915_gem_active_get_rcu(const struct i915_gem_active *active)
-{
-	/*
-	 * Performing a lockless retrieval of the active request is super
-	 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
-	 * slab of request objects will not be freed whilst we hold the
-	 * RCU read lock. It does not guarantee that the request itself
-	 * will not be freed and then *reused*. Viz,
-	 *
-	 * Thread A			Thread B
-	 *
-	 * rq = active.request
-	 *				retire(rq) -> free(rq);
-	 *				(rq is now first on the slab freelist)
-	 *				active.request = NULL
-	 *
-	 *				rq = new submission on a new object
-	 * ref(rq)
-	 *
-	 * To prevent the request from being reused whilst the caller
-	 * uses it, we take a reference like normal. Whilst acquiring
-	 * the reference we check that it is not in a destroyed state
-	 * (refcnt == 0). That prevents the request being reallocated
-	 * whilst the caller holds on to it. To check that the request
-	 * was not reallocated as we acquired the reference we have to
-	 * check that our request remains the active request across
-	 * the lookup, in the same manner as a seqlock. The visibility
-	 * of the pointer versus the reference counting is controlled
-	 * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
-	 *
-	 * In the middle of all that, we inspect whether the request is
-	 * complete. Retiring is lazy so the request may be completed long
-	 * before the active tracker is updated. Querying whether the
-	 * request is complete is far cheaper (as it involves no locked
-	 * instructions setting cachelines to exclusive) than acquiring
-	 * the reference, so we do it first. The RCU read lock ensures the
-	 * pointer dereference is valid, but does not ensure that the
-	 * seqno nor HWS is the right one! However, if the request was
-	 * reallocated, that means the active tracker's request was complete.
-	 * If the new request is also complete, then both are and we can
-	 * just report the active tracker is idle. If the new request is
-	 * incomplete, then we acquire a reference on it and check that
-	 * it remained the active request.
-	 *
-	 * It is then imperative that we do not zero the request on
-	 * reallocation, so that we can chase the dangling pointers!
-	 * See i915_request_alloc().
-	 */
-	do {
-		struct i915_request *request;
-
-		request = rcu_dereference(active->request);
-		if (!request || i915_request_completed(request))
-			return NULL;
-
-		/*
-		 * An especially silly compiler could decide to recompute the
-		 * result of i915_request_completed, more specifically
-		 * re-emit the load for request->fence.seqno. A race would catch
-		 * a later seqno value, which could flip the result from true to
-		 * false. Which means part of the instructions below might not
-		 * be executed, while later on instructions are executed. Due to
-		 * barriers within the refcounting the inconsistency can't reach
-		 * past the call to i915_request_get_rcu, but not executing
-		 * that while still executing i915_request_put() creates
-		 * havoc enough.  Prevent this with a compiler barrier.
-		 */
-		barrier();
-
-		request = i915_request_get_rcu(request);
-
-		/*
-		 * What stops the following rcu_access_pointer() from occurring
-		 * before the above i915_request_get_rcu()? If we were
-		 * to read the value before pausing to get the reference to
-		 * the request, we may not notice a change in the active
-		 * tracker.
-		 *
-		 * The rcu_access_pointer() is a mere compiler barrier, which
-		 * means both the CPU and compiler are free to perform the
-		 * memory read without constraint. The compiler only has to
-		 * ensure that any operations after the rcu_access_pointer()
-		 * occur afterwards in program order. This means the read may
-		 * be performed earlier by an out-of-order CPU, or adventurous
-		 * compiler.
-		 *
-		 * The atomic operation at the heart of
-		 * i915_request_get_rcu(), see dma_fence_get_rcu(), is
-		 * atomic_inc_not_zero() which is only a full memory barrier
-		 * when successful. That is, if i915_request_get_rcu()
-		 * returns the request (and so with the reference counted
-		 * incremented) then the following read for rcu_access_pointer()
-		 * must occur after the atomic operation and so confirm
-		 * that this request is the one currently being tracked.
-		 *
-		 * The corresponding write barrier is part of
-		 * rcu_assign_pointer().
-		 */
-		if (!request || request == rcu_access_pointer(active->request))
-			return rcu_pointer_handoff(request);
-
-		i915_request_put(request);
-	} while (1);
-}
-
-/**
- * i915_gem_active_get_unlocked - return a reference to the active request
- * @active - the active tracker
- *
- * i915_gem_active_get_unlocked() returns a reference to the active request,
- * or NULL if the active tracker is idle. The reference is obtained under RCU,
- * so no locking is required by the caller.
- *
- * The reference should be freed with i915_request_put().
- */
-static inline struct i915_request *
-i915_gem_active_get_unlocked(const struct i915_gem_active *active)
-{
-	struct i915_request *request;
-
-	rcu_read_lock();
-	request = __i915_gem_active_get_rcu(active);
-	rcu_read_unlock();
-
-	return request;
-}
-
-/**
- * i915_gem_active_isset - report whether the active tracker is assigned
- * @active - the active tracker
- *
- * i915_gem_active_isset() returns true if the active tracker is currently
- * assigned to a request. Due to the lazy retiring, that request may be idle
- * and this may report stale information.
- */
-static inline bool
-i915_gem_active_isset(const struct i915_gem_active *active)
-{
-	return rcu_access_pointer(active->request);
-}
-
-/**
- * i915_gem_active_wait - waits until the request is completed
- * @active - the active request on which to wait
- * @flags - how to wait
- * @timeout - how long to wait at most
- * @rps - userspace client to charge for a waitboost
- *
- * i915_gem_active_wait() waits until the request is completed before
- * returning, without requiring any locks to be held. Note that it does not
- * retire any requests before returning.
- *
- * This function relies on RCU in order to acquire the reference to the active
- * request without holding any locks. See __i915_gem_active_get_rcu() for the
- * glory details on how that is managed. Once the reference is acquired, we
- * can then wait upon the request, and afterwards release our reference,
- * free of any locking.
- *
- * This function wraps i915_request_wait(), see it for the full details on
- * the arguments.
- *
- * Returns 0 if successful, or a negative error code.
- */
-static inline int
-i915_gem_active_wait(const struct i915_gem_active *active, unsigned int flags)
-{
-	struct i915_request *request;
-	long ret = 0;
-
-	request = i915_gem_active_get_unlocked(active);
-	if (request) {
-		ret = i915_request_wait(request, flags, MAX_SCHEDULE_TIMEOUT);
-		i915_request_put(request);
-	}
-
-	return ret < 0 ? ret : 0;
-}
-
-/**
- * i915_gem_active_retire - waits until the request is retired
- * @active - the active request on which to wait
- *
- * i915_gem_active_retire() waits until the request is completed,
- * and then ensures that at least the retirement handler for this
- * @active tracker is called before returning. If the @active
- * tracker is idle, the function returns immediately.
- */
-static inline int __must_check
-i915_gem_active_retire(struct i915_gem_active *active,
-		       struct mutex *mutex)
-{
-	struct i915_request *request;
-	long ret;
-
-	request = i915_gem_active_raw(active, mutex);
-	if (!request)
-		return 0;
-
-	ret = i915_request_wait(request,
-				I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
-				MAX_SCHEDULE_TIMEOUT);
-	if (ret < 0)
-		return ret;
-
-	list_del_init(&active->link);
-	RCU_INIT_POINTER(active->request, NULL);
-
-	active->retire(active, request);
-
-	return 0;
-}
-
-#define for_each_active(mask, idx) \
-	for (; mask ? idx = ffs(mask) - 1, 1 : 0; mask &= ~BIT(idx))
-
 #endif /* I915_REQUEST_H */
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 328b35410672..555f358bf6ba 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -888,7 +888,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		struct i915_request *rq;
 		long timeout;
 
-		rq = i915_gem_active_get_unlocked(&tl->last_request);
+		rq = i915_active_request_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index b354843a5040..b2202d2e58a2 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -163,8 +163,8 @@ int i915_timeline_init(struct drm_i915_private *i915,
 
 	spin_lock_init(&timeline->lock);
 
-	init_request_active(&timeline->barrier, NULL);
-	init_request_active(&timeline->last_request, NULL);
+	INIT_ACTIVE_REQUEST(&timeline->barrier);
+	INIT_ACTIVE_REQUEST(&timeline->last_request);
 	INIT_LIST_HEAD(&timeline->requests);
 
 	i915_syncmap_init(&timeline->sync);
@@ -236,7 +236,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 {
 	GEM_BUG_ON(timeline->pin_count);
 	GEM_BUG_ON(!list_empty(&timeline->requests));
-	GEM_BUG_ON(i915_gem_active_isset(&timeline->barrier));
+	GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
 
 	i915_syncmap_free(&timeline->sync);
 	hwsp_free(timeline);
@@ -268,25 +268,6 @@ i915_timeline_create(struct drm_i915_private *i915,
 	return timeline;
 }
 
-int i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq)
-{
-	struct i915_request *old;
-	int err;
-
-	lockdep_assert_held(&rq->i915->drm.struct_mutex);
-
-	/* Must maintain ordering wrt existing barriers */
-	old = i915_gem_active_raw(&tl->barrier, &rq->i915->drm.struct_mutex);
-	if (old) {
-		err = i915_request_await_dma_fence(rq, &old->fence);
-		if (err)
-			return err;
-	}
-
-	i915_gem_active_set(&tl->barrier, rq);
-	return 0;
-}
-
 int i915_timeline_pin(struct i915_timeline *tl)
 {
 	int err;
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index d167e04073c5..7bec7d2e45bf 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -28,6 +28,7 @@
 #include <linux/list.h>
 #include <linux/kref.h>
 
+#include "i915_active.h"
 #include "i915_request.h"
 #include "i915_syncmap.h"
 #include "i915_utils.h"
@@ -58,10 +59,10 @@ struct i915_timeline {
 
 	/* Contains an RCU guarded pointer to the last request. No reference is
 	 * held to the request, users must carefully acquire a reference to
-	 * the request using i915_gem_active_get_request_rcu(), or hold the
+	 * the request using i915_active_request_get_request_rcu(), or hold the
 	 * struct_mutex.
 	 */
-	struct i915_gem_active last_request;
+	struct i915_active_request last_request;
 
 	/**
 	 * We track the most recent seqno that we wait on in every context so
@@ -82,7 +83,7 @@ struct i915_timeline {
 	 * subsequent submissions to this timeline be executed only after the
 	 * barrier has been completed.
 	 */
-	struct i915_gem_active barrier;
+	struct i915_active_request barrier;
 
 	struct list_head link;
 	const char *name;
@@ -174,7 +175,10 @@ void i915_timelines_fini(struct drm_i915_private *i915);
  * submissions on @timeline. Subsequent requests will not be submitted to GPU
  * until the barrier has been completed.
  */
-int i915_timeline_set_barrier(struct i915_timeline *timeline,
-			      struct i915_request *rq);
+static inline int
+i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq)
+{
+	return i915_active_request_set(&tl->barrier, rq);
+}
 
 #endif
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 2456bfb4877b..376821c37d72 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -121,7 +121,7 @@ vma_create(struct drm_i915_gem_object *obj,
 
 	i915_active_init(i915_gt_active(vm->i915),
 			 &vma->active, __i915_vma_retire);
-	init_request_active(&vma->last_fence, NULL);
+	INIT_ACTIVE_REQUEST(&vma->last_fence);
 
 	vma->vm = vm;
 	vma->ops = &vm->vma_ops;
@@ -809,7 +809,7 @@ static void __i915_vma_destroy(struct i915_vma *vma)
 	GEM_BUG_ON(vma->node.allocated);
 	GEM_BUG_ON(vma->fence);
 
-	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
+	GEM_BUG_ON(i915_active_request_isset(&vma->last_fence));
 
 	mutex_lock(&vma->vm->mutex);
 	list_del(&vma->vm_link);
@@ -943,14 +943,14 @@ int i915_vma_move_to_active(struct i915_vma *vma,
 		obj->write_domain = I915_GEM_DOMAIN_RENDER;
 
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
-			i915_gem_active_set(&obj->frontbuffer_write, rq);
+			__i915_active_request_set(&obj->frontbuffer_write, rq);
 
 		obj->read_domains = 0;
 	}
 	obj->read_domains |= I915_GEM_GPU_DOMAINS;
 
 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
-		i915_gem_active_set(&vma->last_fence, rq);
+		__i915_active_request_set(&vma->last_fence, rq);
 
 	export_fence(vma, rq, flags);
 	return 0;
@@ -987,8 +987,8 @@ int i915_vma_unbind(struct i915_vma *vma)
 		if (ret)
 			goto unpin;
 
-		ret = i915_gem_active_retire(&vma->last_fence,
-					     &vma->vm->i915->drm.struct_mutex);
+		ret = i915_active_request_retire(&vma->last_fence,
+					      &vma->vm->i915->drm.struct_mutex);
 unpin:
 		__i915_vma_unpin(vma);
 		if (ret)
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 3c03d4569481..7c742027f866 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -110,7 +110,7 @@ struct i915_vma {
 #define I915_VMA_GGTT_WRITE	BIT(15)
 
 	struct i915_active active;
-	struct i915_gem_active last_fence;
+	struct i915_active_request last_fence;
 
 	/**
 	 * Support different GGTT views into the same object.
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 71c01eb13af1..49fa43ff02ba 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1086,7 +1086,7 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
 	 * the last request that remains in the timeline. When idle, it is
 	 * the last executed context as tracked by retirement.
 	 */
-	rq = __i915_gem_active_peek(&engine->timeline.last_request);
+	rq = __i915_active_request_peek(&engine->timeline.last_request);
 	if (rq)
 		return rq->hw_context == kernel_context;
 	else
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a9238fd07e30..c0df1dbb0069 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -186,7 +186,7 @@ struct intel_overlay {
 	struct overlay_registers __iomem *regs;
 	u32 flip_addr;
 	/* flip handling */
-	struct i915_gem_active last_flip;
+	struct i915_active_request last_flip;
 };
 
 static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv,
@@ -214,23 +214,23 @@ static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv,
 
 static void intel_overlay_submit_request(struct intel_overlay *overlay,
 					 struct i915_request *rq,
-					 i915_gem_retire_fn retire)
+					 i915_active_retire_fn retire)
 {
-	GEM_BUG_ON(i915_gem_active_peek(&overlay->last_flip,
-					&overlay->i915->drm.struct_mutex));
-	i915_gem_active_set_retire_fn(&overlay->last_flip, retire,
-				      &overlay->i915->drm.struct_mutex);
-	i915_gem_active_set(&overlay->last_flip, rq);
+	GEM_BUG_ON(i915_active_request_peek(&overlay->last_flip,
+					    &overlay->i915->drm.struct_mutex));
+	i915_active_request_set_retire_fn(&overlay->last_flip, retire,
+					  &overlay->i915->drm.struct_mutex);
+	__i915_active_request_set(&overlay->last_flip, rq);
 	i915_request_add(rq);
 }
 
 static int intel_overlay_do_wait_request(struct intel_overlay *overlay,
 					 struct i915_request *rq,
-					 i915_gem_retire_fn retire)
+					 i915_active_retire_fn retire)
 {
 	intel_overlay_submit_request(overlay, rq, retire);
-	return i915_gem_active_retire(&overlay->last_flip,
-				      &overlay->i915->drm.struct_mutex);
+	return i915_active_request_retire(&overlay->last_flip,
+					  &overlay->i915->drm.struct_mutex);
 }
 
 static struct i915_request *alloc_request(struct intel_overlay *overlay)
@@ -351,8 +351,9 @@ static void intel_overlay_release_old_vma(struct intel_overlay *overlay)
 	i915_vma_put(vma);
 }
 
-static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active,
-					       struct i915_request *rq)
+static void
+intel_overlay_release_old_vid_tail(struct i915_active_request *active,
+				   struct i915_request *rq)
 {
 	struct intel_overlay *overlay =
 		container_of(active, typeof(*overlay), last_flip);
@@ -360,7 +361,7 @@ static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active,
 	intel_overlay_release_old_vma(overlay);
 }
 
-static void intel_overlay_off_tail(struct i915_gem_active *active,
+static void intel_overlay_off_tail(struct i915_active_request *active,
 				   struct i915_request *rq)
 {
 	struct intel_overlay *overlay =
@@ -423,8 +424,8 @@ static int intel_overlay_off(struct intel_overlay *overlay)
  * We have to be careful not to repeat work forever an make forward progess. */
 static int intel_overlay_recover_from_interrupt(struct intel_overlay *overlay)
 {
-	return i915_gem_active_retire(&overlay->last_flip,
-				      &overlay->i915->drm.struct_mutex);
+	return i915_active_request_retire(&overlay->last_flip,
+					  &overlay->i915->drm.struct_mutex);
 }
 
 /* Wait for pending overlay flip and release old frame.
@@ -1357,7 +1358,7 @@ void intel_overlay_setup(struct drm_i915_private *dev_priv)
 	overlay->contrast = 75;
 	overlay->saturation = 146;
 
-	init_request_active(&overlay->last_flip, NULL);
+	INIT_ACTIVE_REQUEST(&overlay->last_flip);
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 1151c54d2acf..b0331b0bfbc0 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -503,8 +503,8 @@ static int live_suppress_wait_preempt(void *arg)
 				}
 
 				/* Disable NEWCLIENT promotion */
-				i915_gem_active_set(&rq[i]->timeline->last_request,
-						    dummy);
+				__i915_active_request_set(&rq[i]->timeline->last_request,
+							  dummy);
 				i915_request_add(rq[i]);
 			}
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
index e5659aaa856d..d2de9ece2118 100644
--- a/drivers/gpu/drm/i915/selftests/mock_timeline.c
+++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
@@ -15,8 +15,8 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context)
 
 	spin_lock_init(&timeline->lock);
 
-	init_request_active(&timeline->barrier, NULL);
-	init_request_active(&timeline->last_request, NULL);
+	INIT_ACTIVE_REQUEST(&timeline->barrier);
+	INIT_ACTIVE_REQUEST(&timeline->last_request);
 	INIT_LIST_HEAD(&timeline->requests);
 
 	i915_syncmap_init(&timeline->sync);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (5 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30 17:54   ` Tvrtko Ursulin
  2019-01-30  2:19 ` [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
                   ` (7 subsequent siblings)
  14 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

In preparation for enabling HW semaphores, we need to keep the in-flight
timeline HWSP alive until the entire system is idle, as any other
timeline active on the GPU may still refer back to the already retired
timeline. We have to delay both recycling available cachelines and
unpinning the old HWSP until the next idle point (i.e. on parking).

That we have to keep the HWSP alive for external references on HW raises
an interesting conundrum. On a busy system, we may never see a global
idle point, essentially meaning the resource will leak until we are
forced to sleep. What we need is a set of RCU primitives for the GPU!
This should also help mitigate the resource starvation issues stemming
from keeping all logical state pinned until idle (instead of, as
currently handled, until the next context switch).
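
As a rough sketch of the lifetime rule this patch implements (a
cacheline may only be recycled once the timeline has detached from it
and every request that might still sample it has retired), consider the
userspace analogy below. The names and the plain counter are
illustrative stand-ins for the i915_active tracking added here:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct cacheline {
            int busy;   /* in-flight requests that may still read the seqno */
            bool free;  /* timeline has moved on (e.g. after seqno wraparound) */
    };

    /* cf. __idle_cacheline_free(): reclaim only once idle *and* disowned */
    static void cacheline_put(struct cacheline *cl)
    {
            if (!cl->busy && cl->free)
                    printf("cacheline reclaimed\n");
    }

    /* cf. __cacheline_retire(): a request referencing the HWSP retires */
    static void request_retire(struct cacheline *cl)
    {
            assert(cl->busy > 0);
            cl->busy--;
            cacheline_put(cl);
    }

    /* cf. __i915_timeline_get_seqno(): on wraparound, detach from the old
     * cacheline and leave it in flight alongside its remaining users */
    static void timeline_wrap(struct cacheline *old)
    {
            old->free = true;
            cacheline_put(old);
    }

    int main(void)
    {
            struct cacheline cl = { .busy = 1 };    /* one request still watching */

            timeline_wrap(&cl);     /* timeline moves to a fresh cacheline */
            request_retire(&cl);    /* last user retires: only now reclaimed */
            return 0;
    }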

v2: Use idle barriers to free stale HWSP as soon as all current requests
are idle, rather than rely on the system reaching a global idle point.
(Tvrtko)
v3: Replace the idle barrier with read locks.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c  |  30 ++--
 drivers/gpu/drm/i915/i915_timeline.c | 229 +++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_timeline.h |   9 +-
 3 files changed, 237 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a09f47ccc703..07e4c3c68ecd 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -326,11 +326,6 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (tmp != rq);
 }
 
-static u32 timeline_get_seqno(struct i915_timeline *tl)
-{
-	return tl->seqno += 1 + tl->has_initial_breadcrumb;
-}
-
 static void move_to_timeline(struct i915_request *request,
 			     struct i915_timeline *timeline)
 {
@@ -539,8 +534,10 @@ struct i915_request *
 i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 {
 	struct drm_i915_private *i915 = engine->i915;
-	struct i915_request *rq;
 	struct intel_context *ce;
+	struct i915_timeline *tl;
+	struct i915_request *rq;
+	u32 seqno;
 	int ret;
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
@@ -615,24 +612,26 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 		}
 	}
 
-	rq->rcustate = get_state_synchronize_rcu();
-
 	INIT_LIST_HEAD(&rq->active_list);
+
+	tl = ce->ring->timeline;
+	ret = i915_timeline_get_seqno(tl, rq, &seqno);
+	if (ret)
+		goto err_free;
+
 	rq->i915 = i915;
 	rq->engine = engine;
 	rq->gem_context = ctx;
 	rq->hw_context = ce;
 	rq->ring = ce->ring;
-	rq->timeline = ce->ring->timeline;
+	rq->timeline = tl;
 	GEM_BUG_ON(rq->timeline == &engine->timeline);
-	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
+	rq->hwsp_seqno = tl->hwsp_seqno;
+	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
 
 	spin_lock_init(&rq->lock);
-	dma_fence_init(&rq->fence,
-		       &i915_fence_ops,
-		       &rq->lock,
-		       rq->timeline->fence_context,
-		       timeline_get_seqno(rq->timeline));
+	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
+		       tl->fence_context, seqno);
 
 	/* We bump the ref for the fence chain */
 	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
@@ -693,6 +692,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
 	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
 
+err_free:
 	kmem_cache_free(i915->requests, rq);
 err_unreserve:
 	unreserve_gt(i915);
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index b2202d2e58a2..fd1a92a3663d 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -6,19 +6,28 @@
 
 #include "i915_drv.h"
 
-#include "i915_timeline.h"
+#include "i915_active.h"
 #include "i915_syncmap.h"
+#include "i915_timeline.h"
 
 struct i915_timeline_hwsp {
-	struct i915_vma *vma;
+	struct i915_gt_timelines *gt;
 	struct list_head free_link;
+	struct i915_vma *vma;
 	u64 free_bitmap;
 };
 
-static inline struct i915_timeline_hwsp *
-i915_timeline_hwsp(const struct i915_timeline *tl)
+struct i915_timeline_cacheline {
+	struct i915_active active;
+	struct i915_timeline_hwsp *hwsp;
+	unsigned int cacheline : 6;
+	unsigned int free : 1;
+};
+
+static inline struct drm_i915_private *
+hwsp_to_i915(struct i915_timeline_hwsp *hwsp)
 {
-	return tl->hwsp_ggtt->private;
+	return container_of(hwsp->gt, struct drm_i915_private, gt.timelines);
 }
 
 static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
@@ -71,6 +80,7 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
 		vma->private = hwsp;
 		hwsp->vma = vma;
 		hwsp->free_bitmap = ~0ull;
+		hwsp->gt = gt;
 
 		spin_lock(&gt->hwsp_lock);
 		list_add(&hwsp->free_link, &gt->hwsp_free_list);
@@ -88,14 +98,9 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
 	return hwsp->vma;
 }
 
-static void hwsp_free(struct i915_timeline *timeline)
+static void __idle_hwsp_free(struct i915_timeline_hwsp *hwsp, int cacheline)
 {
-	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
-	struct i915_timeline_hwsp *hwsp;
-
-	hwsp = i915_timeline_hwsp(timeline);
-	if (!hwsp) /* leave global HWSP alone! */
-		return;
+	struct i915_gt_timelines *gt = hwsp->gt;
 
 	spin_lock(&gt->hwsp_lock);
 
@@ -103,7 +108,8 @@ static void hwsp_free(struct i915_timeline *timeline)
 	if (!hwsp->free_bitmap)
 		list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
 
-	hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
+	GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
+	hwsp->free_bitmap |= BIT_ULL(cacheline);
 
 	/* And if no one is left using it, give the page back to the system */
 	if (hwsp->free_bitmap == ~0ull) {
@@ -115,6 +121,80 @@ static void hwsp_free(struct i915_timeline *timeline)
 	spin_unlock(&gt->hwsp_lock);
 }
 
+static void __idle_cacheline_free(struct i915_timeline_cacheline *cl)
+{
+	GEM_BUG_ON(!i915_active_is_idle(&cl->active));
+
+	i915_vma_put(cl->hwsp->vma);
+	__idle_hwsp_free(cl->hwsp, cl->cacheline);
+
+	i915_active_fini(&cl->active);
+	kfree(cl);
+}
+
+static void __idle_cacheline_park(struct i915_timeline_cacheline *cl)
+{
+	i915_active_fini(&cl->active);
+}
+
+static void __cacheline_retire(struct i915_active *active)
+{
+	struct i915_timeline_cacheline *cl =
+		container_of(active, typeof(*cl), active);
+
+	i915_vma_unpin(cl->hwsp->vma);
+	if (!cl->free)
+		__idle_cacheline_park(cl);
+	else
+		__idle_cacheline_free(cl);
+}
+
+static struct i915_timeline_cacheline *
+cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
+{
+	struct i915_timeline_cacheline *cl;
+
+	GEM_BUG_ON(cacheline >= 64);
+
+	cl = kmalloc(sizeof(*cl), GFP_KERNEL);
+	if (!cl)
+		return ERR_PTR(-ENOMEM);
+
+	i915_vma_get(hwsp->vma);
+	cl->hwsp = hwsp;
+	cl->cacheline = cacheline;
+	cl->free = false;
+
+	i915_active_init(i915_gt_active(hwsp_to_i915(hwsp)),
+			 &cl->active, __cacheline_retire);
+
+	return cl;
+}
+
+static void cacheline_acquire(struct i915_timeline_cacheline *cl)
+{
+	if (cl && i915_active_acquire(&cl->active))
+		__i915_vma_pin(cl->hwsp->vma);
+}
+
+static void cacheline_release(struct i915_timeline_cacheline *cl)
+{
+	if (cl)
+		i915_active_release(&cl->active);
+}
+
+static void cacheline_free(struct i915_timeline_cacheline *cl)
+{
+	if (!cl)
+		return;
+
+	GEM_BUG_ON(cl->free);
+	cl->free = true;
+
+	if (i915_active_is_idle(&cl->active))
+		__idle_cacheline_free(cl);
+}
+
 int i915_timeline_init(struct drm_i915_private *i915,
 		       struct i915_timeline *timeline,
 		       const char *name,
@@ -136,22 +216,32 @@ int i915_timeline_init(struct drm_i915_private *i915,
 	timeline->name = name;
 	timeline->pin_count = 0;
 	timeline->has_initial_breadcrumb = !hwsp;
+	timeline->hwsp_cacheline = NULL;
 
 	timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
 	if (!hwsp) {
+		struct i915_timeline_cacheline *cl;
 		unsigned int cacheline;
 
 		hwsp = hwsp_alloc(timeline, &cacheline);
 		if (IS_ERR(hwsp))
 			return PTR_ERR(hwsp);
 
+		cl = cacheline_alloc(hwsp->private, cacheline);
+		if (IS_ERR(cl)) {
+			__idle_hwsp_free(hwsp->private, cacheline);
+			return PTR_ERR(cl);
+		}
+
 		timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
+		timeline->hwsp_cacheline = cl;
 	}
 	timeline->hwsp_ggtt = i915_vma_get(hwsp);
+	GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
 
 	vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
 	if (IS_ERR(vaddr)) {
-		hwsp_free(timeline);
+		cacheline_free(timeline->hwsp_cacheline);
 		i915_vma_put(hwsp);
 		return PTR_ERR(vaddr);
 	}
@@ -239,7 +329,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 	GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
 
 	i915_syncmap_free(&timeline->sync);
-	hwsp_free(timeline);
+	cacheline_free(timeline->hwsp_cacheline);
 
 	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
 	i915_vma_put(timeline->hwsp_ggtt);
@@ -284,6 +374,7 @@ int i915_timeline_pin(struct i915_timeline *tl)
 		i915_ggtt_offset(tl->hwsp_ggtt) +
 		offset_in_page(tl->hwsp_offset);
 
+	cacheline_acquire(tl->hwsp_cacheline);
 	timeline_add_to_active(tl);
 
 	return 0;
@@ -293,6 +384,113 @@ int i915_timeline_pin(struct i915_timeline *tl)
 	return err;
 }
 
+static u32 timeline_advance(struct i915_timeline *tl)
+{
+	GEM_BUG_ON(!tl->pin_count);
+	GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
+
+	return tl->seqno += 1 + tl->has_initial_breadcrumb;
+}
+
+static void timeline_rollback(struct i915_timeline *tl)
+{
+	tl->seqno -= 1 + tl->has_initial_breadcrumb;
+}
+
+static noinline int
+__i915_timeline_get_seqno(struct i915_timeline *tl,
+			  struct i915_request *rq,
+			  u32 *seqno)
+{
+	struct i915_timeline_cacheline *cl;
+	struct i915_vma *vma;
+	unsigned int cacheline;
+	int err;
+
+	/*
+	 * If there is an outstanding GPU reference to this cacheline,
+	 * such as it being sampled by a HW semaphore on another timeline,
+	 * we cannot wraparound our seqno value (the HW semaphore does
+	 * a strict greater-than-or-equals compare, not i915_seqno_passed).
+	 * So if the cacheline is still busy, we must detach ourselves
+	 * from it and leave it inflight alongside its users.
+	 *
+	 * However, if nobody is watching and we can guarantee that nobody
+	 * will, we could simply reuse the same cacheline.
+	 *
+	 * // while locked
+	 * if (i915_active_request_is_signaled(&tl->last_request) &&
+	 *     i915_active_is_signaled(&tl->hwsp_cacheline->active))
+	 *	return 0;
+	 *
+	 * That seems unlikely for a busy timeline that needed to wrap in
+	 * the first place, so just replace the cacheline.
+	 */
+
+	vma = hwsp_alloc(tl, &cacheline);
+	if (IS_ERR(vma)) {
+		err = PTR_ERR(vma);
+		goto err_rollback;
+	}
+
+	cl = cacheline_alloc(vma->private, cacheline);
+	if (IS_ERR(cl)) {
+		err = PTR_ERR(cl);
+		goto err_hwsp;
+	}
+
+	/*
+	 * Attach the old cacheline to the current request, so that we only
+	 * free it after the current request is retired, which ensures that
+	 * all writes into the cacheline from previous requests are complete.
+	 */
+	err = i915_active_ref(&tl->hwsp_cacheline->active,
+			      tl->fence_context, rq);
+	if (err)
+		goto err_cacheline;
+
+	tl->hwsp_ggtt = i915_vma_get(vma);
+	tl->hwsp_offset = cacheline * CACHELINE_BYTES;
+	__i915_vma_pin(tl->hwsp_ggtt);
+
+	cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
+	cacheline_free(tl->hwsp_cacheline);
+
+	cacheline_acquire(cl);
+	tl->hwsp_cacheline = cl;
+
+	*seqno = timeline_advance(tl);
+	return 0;
+
+err_cacheline:
+	kfree(cl);
+err_hwsp:
+	__idle_hwsp_free(vma->private, cacheline);
+err_rollback:
+	timeline_rollback(tl);
+	return err;
+}
+
+int i915_timeline_get_seqno(struct i915_timeline *tl,
+			    struct i915_request *rq,
+			    u32 *seqno)
+{
+	*seqno = timeline_advance(tl);
+
+	/* Replace the HWSP on wraparound for HW semaphores */
+	if (unlikely(!*seqno && tl->hwsp_cacheline))
+		return __i915_timeline_get_seqno(tl, rq, seqno);
+
+	return 0;
+}
+
+int i915_timeline_read_lock(struct i915_timeline *tl, struct i915_request *rq)
+{
+	GEM_BUG_ON(!tl->pin_count);
+	return i915_active_ref(&tl->hwsp_cacheline->active,
+			       rq->fence.context, rq);
+}
+
 void i915_timeline_unpin(struct i915_timeline *tl)
 {
 	GEM_BUG_ON(!tl->pin_count);
@@ -300,6 +498,7 @@ void i915_timeline_unpin(struct i915_timeline *tl)
 		return;
 
 	timeline_remove_from_active(tl);
+	cacheline_release(tl->hwsp_cacheline);
 
 	/*
 	 * Since this timeline is idle, all bariers upon which we were waiting
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 7bec7d2e45bf..d78ec6fbc000 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -34,7 +34,7 @@
 #include "i915_utils.h"
 
 struct i915_vma;
-struct i915_timeline_hwsp;
+struct i915_timeline_cacheline;
 
 struct i915_timeline {
 	u64 fence_context;
@@ -49,6 +49,8 @@ struct i915_timeline {
 	struct i915_vma *hwsp_ggtt;
 	u32 hwsp_offset;
 
+	struct i915_timeline_cacheline *hwsp_cacheline;
+
 	bool has_initial_breadcrumb;
 
 	/**
@@ -160,6 +162,11 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
 }
 
 int i915_timeline_pin(struct i915_timeline *tl);
+int i915_timeline_get_seqno(struct i915_timeline *tl,
+			    struct i915_request *rq,
+			    u32 *seqno);
+int i915_timeline_read_lock(struct i915_timeline *tl,
+			    struct i915_request *rq);
 void i915_timeline_unpin(struct i915_timeline *tl);
 
 void i915_timelines_init(struct drm_i915_private *i915);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (6 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30 18:05   ` Tvrtko Ursulin
  2019-01-30  2:19 ` [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ Chris Wilson
                   ` (6 subsequent siblings)
  14 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

In the next patch, we add another user that wants to check whether
requests can be merged into a single HW execution, and in the future we
want to add more conditions under which requests from the same context
cannot be merged. In preparation, extract out can_merge_rq().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 2616b0b3e8d5..e97ce54138d3 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -285,12 +285,11 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
 }
 
 __maybe_unused static inline bool
-assert_priority_queue(const struct intel_engine_execlists *execlists,
-		      const struct i915_request *prev,
+assert_priority_queue(const struct i915_request *prev,
 		      const struct i915_request *next)
 {
-	if (!prev)
-		return true;
+	const struct intel_engine_execlists *execlists =
+		&prev->engine->execlists;
 
 	/*
 	 * Without preemption, the prev may refer to the still active element
@@ -601,6 +600,17 @@ static bool can_merge_ctx(const struct intel_context *prev,
 	return true;
 }
 
+static bool can_merge_rq(const struct i915_request *prev,
+			 const struct i915_request *next)
+{
+	GEM_BUG_ON(!assert_priority_queue(prev, next));
+
+	if (!can_merge_ctx(prev->hw_context, next->hw_context))
+		return false;
+
+	return true;
+}
+
 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 {
 	GEM_BUG_ON(rq == port_request(port));
@@ -753,8 +763,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		int i;
 
 		priolist_for_each_request_consume(rq, rn, p, i) {
-			GEM_BUG_ON(!assert_priority_queue(execlists, last, rq));
-
 			/*
 			 * Can we combine this request with the current port?
 			 * It has to be the same context/ringbuffer and not
@@ -766,8 +774,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			 * second request, and so we never need to tell the
 			 * hardware about the first.
 			 */
-			if (last &&
-			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
+			if (last && !can_merge_rq(last, rq)) {
+				if (last->hw_context == rq->hw_context)
+					goto done;
+
 				/*
 				 * If we are on the second port and cannot
 				 * combine this request with the last, then we
@@ -787,7 +797,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 				    ctx_single_port_submission(rq->hw_context))
 					goto done;
 
-				GEM_BUG_ON(last->hw_context == rq->hw_context);
 
 				if (submit)
 					port_assign(port, last);
@@ -827,8 +836,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * request triggering preemption on the next dequeue (or subsequent
 	 * interrupt for secondary ports).
 	 */
-	execlists->queue_priority_hint =
-		port != execlists->port ? rq_prio(last) : INT_MIN;
+	execlists->queue_priority_hint = queue_prio(execlists);
 
 	if (submit) {
 		port_assign(port, last);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (7 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-31 13:19   ` Tvrtko Ursulin
  2019-01-30  2:19 ` [PATCH 11/11] drm/i915: Prioritise non-busywait semaphore workloads Chris Wilson
                   ` (5 subsequent siblings)
  14 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

Having introduced a per-context seqno, we now have a means to identify
progress across the system without fear of the rollback that befell the
global_seqno. That is, we can program a MI_SEMAPHORE_WAIT operation in
advance of submission, safe in the knowledge that our target seqno and
address are stable.

However, since we are telling the GPU to busy-spin on the target address
until it matches the signaling seqno, we only want to do so when we are
sure that the busy-spin will complete quickly. To achieve this, we only
submit the request to HW once the signaler is itself executing (modulo
preemption causing us to wait longer), and we only do so for requests of
default priority and above (so that idle-priority tasks never themselves
hog the GPU waiting for others).
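
When the signaler has already started executing, the wait below is
emitted as a "not equal to the seqno preceding the signaler" compare
rather than greater-or-equal. A tiny userspace analogy (illustrative
values only, with plain 32-bit arithmetic standing in for the HWSP read)
of why that sidesteps seqno wraparound:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint32_t expected = UINT32_MAX;     /* signaler's seqno */
            uint32_t start = expected - 1;      /* HWSP value before it ran */
            uint32_t hwsp = 2;                  /* HWSP after later seqnos wrapped */

            /* A plain >= compare never fires once the seqno has wrapped... */
            printf("GTE sees completion: %s\n", hwsp >= expected ? "yes" : "no");
            /* ...whereas "has the breadcrumb moved off its start value?" does. */
            printf("NEQ sees completion: %s\n", hwsp != start ? "yes" : "no");
            return 0;
    }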

But what about AB-BA deadlocks? If you remove B, there can be no
deadlock... The issue is that with a deep ELSP queue, we can queue up an
AB-BA pair of waits on different engines, thus forming a classic mutual
exclusion deadlock. We side-step that issue by restricting the queue
depth so that we never have multiple semaphores in flight, and so only
ever take one set of locks at a time.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c       | 153 +++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_request.h       |   1 +
 drivers/gpu/drm/i915/i915_scheduler.c     |   1 +
 drivers/gpu/drm/i915/i915_scheduler.h     |   1 +
 drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
 drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
 drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
 drivers/gpu/drm/i915/intel_lrc.c          |  14 +-
 8 files changed, 178 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 07e4c3c68ecd..6d825cd28ae6 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -22,8 +22,9 @@
  *
  */
 
-#include <linux/prefetch.h>
 #include <linux/dma-fence-array.h>
+#include <linux/irq_work.h>
+#include <linux/prefetch.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/signal.h>
@@ -326,6 +327,76 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (tmp != rq);
 }
 
+struct execute_cb {
+	struct list_head link;
+	struct irq_work work;
+	struct i915_sw_fence *fence;
+};
+
+static void irq_execute_cb(struct irq_work *wrk)
+{
+	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
+
+	i915_sw_fence_complete(cb->fence);
+	kfree(cb);
+}
+
+static void __notify_execute_cb(struct i915_request *rq)
+{
+	struct execute_cb *cb;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (list_empty(&rq->execute_cb))
+		return;
+
+	list_for_each_entry(cb, &rq->execute_cb, link)
+		irq_work_queue(&cb->work);
+
+	/*
+	 * XXX Rollback on __i915_request_unsubmit()
+	 *
+	 * In the future, perhaps when we have an active time-slicing scheduler,
+	 * it will be interesting to unsubmit parallel execution and remove
+	 * busywaits from the GPU until their master is restarted. This is
+	 * quite hairy, we have to carefully rollback the fence and do a
+	 * preempt-to-idle cycle on the target engine, all the while the
+	 * master execute_cb may refire.
+	 */
+	INIT_LIST_HEAD(&rq->execute_cb);
+}
+
+static int
+i915_request_await_execution(struct i915_request *rq,
+			     struct i915_request *signal,
+			     gfp_t gfp)
+{
+	struct execute_cb *cb;
+	unsigned long flags;
+
+	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags))
+		return 0;
+
+	cb = kmalloc(sizeof(*cb), gfp);
+	if (!cb)
+		return -ENOMEM;
+
+	cb->fence = &rq->submit;
+	i915_sw_fence_await(cb->fence);
+	init_irq_work(&cb->work, irq_execute_cb);
+
+	spin_lock_irqsave(&signal->lock, flags);
+	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) {
+		i915_sw_fence_complete(cb->fence);
+		kfree(cb);
+	} else {
+		list_add_tail(&cb->link, &signal->execute_cb);
+	}
+	spin_unlock_irqrestore(&signal->lock, flags);
+
+	return 0;
+}
+
 static void move_to_timeline(struct i915_request *request,
 			     struct i915_timeline *timeline)
 {
@@ -373,6 +444,7 @@ void __i915_request_submit(struct i915_request *request)
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
 	    !i915_request_enable_breadcrumb(request))
 		intel_engine_queue_breadcrumbs(engine);
+	__notify_execute_cb(request);
 	spin_unlock(&request->lock);
 
 	engine->emit_fini_breadcrumb(request,
@@ -613,6 +685,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	}
 
 	INIT_LIST_HEAD(&rq->active_list);
+	INIT_LIST_HEAD(&rq->execute_cb);
 
 	tl = ce->ring->timeline;
 	ret = i915_timeline_get_seqno(tl, rq, &seqno);
@@ -700,6 +773,81 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	return ERR_PTR(ret);
 }
 
+static int
+emit_semaphore_wait(struct i915_request *to,
+		    struct i915_request *from,
+		    gfp_t gfp)
+{
+	u32 *cs;
+	int err;
+
+	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
+
+	err = i915_timeline_read_lock(from->timeline, to);
+	if (err)
+		return err;
+
+	/*
+	 * If we know our signaling request has started, we know that it
+	 * must, at least, have passed its initial breadcrumb and that its
+	 * seqno can only increase, therefore any change in its breadcrumb
+	 * must indicate completion. By using a "not equal to start" compare
+	 * we avoid the murky issue of how to handle seqno wraparound in an
+	 * async environment (short answer, we must stop the world whenever
+	 * any context wraps!) as the likelihood of missing one request then
+	 * seeing the same start value for a new request is 1 in 2^31, and
+	 * even then we know that the new request has started and is in
+	 * progress, so we are sure it will complete soon enough (not to
+	 * worry about).
+	 */
+	if (i915_request_started(from)) {
+		cs = intel_ring_begin(to, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_NEQ_SDD;
+		*cs++ = from->fence.seqno - 1;
+		*cs++ = from->timeline->hwsp_offset;
+		*cs++ = 0;
+
+		intel_ring_advance(to, cs);
+	} else {
+		int err;
+
+		err = i915_request_await_execution(to, from, gfp);
+		if (err)
+			return err;
+
+		cs = intel_ring_begin(to, 4);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		/*
+		 * Using greater-than-or-equal here means we have to worry
+		 * about seqno wraparound. To side step that issue, we swap
+		 * the timeline HWSP upon wrapping, so that everyone listening
+		 * for the old (pre-wrap) values do not see the much smaller
+		 * (post-wrap) values than they were expecting (and so wait
+		 * forever).
+		 */
+		*cs++ = MI_SEMAPHORE_WAIT |
+			MI_SEMAPHORE_GLOBAL_GTT |
+			MI_SEMAPHORE_POLL |
+			MI_SEMAPHORE_SAD_GTE_SDD;
+		*cs++ = from->fence.seqno;
+		*cs++ = from->timeline->hwsp_offset;
+		*cs++ = 0;
+
+		intel_ring_advance(to, cs);
+	}
+
+	to->sched.semaphore = true;
+	return 0;
+}
+
 static int
 i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
@@ -723,6 +871,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
 						       I915_FENCE_GFP);
+	} else if (HAS_EXECLISTS(to->i915) &&
+		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
 	} else {
 		ret = i915_sw_fence_await_dma_fence(&to->submit,
 						    &from->fence, 0,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 40f3e8dcbdd5..66a374ee177a 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -127,6 +127,7 @@ struct i915_request {
 	 */
 	struct i915_sw_fence submit;
 	wait_queue_entry_t submitq;
+	struct list_head execute_cb;
 
 	/*
 	 * A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index d01683167c77..aa6c663dca09 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -29,6 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node)
 	INIT_LIST_HEAD(&node->waiters_list);
 	INIT_LIST_HEAD(&node->link);
 	node->attr.priority = I915_PRIORITY_INVALID;
+	node->semaphore = false;
 }
 
 static struct i915_dependency *
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index dbe9cb7ecd82..d764cf10536f 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -72,6 +72,7 @@ struct i915_sched_node {
 	struct list_head waiters_list; /* those after us, they depend upon us */
 	struct list_head link;
 	struct i915_sched_attr attr;
+	bool semaphore;
 };
 
 struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..8d1400d378d7 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
 	__i915_sw_fence_notify(fence, FENCE_FREE);
 }
 
-static void i915_sw_fence_complete(struct i915_sw_fence *fence)
+void i915_sw_fence_complete(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 
@@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
 	__i915_sw_fence_complete(fence, NULL);
 }
 
-static void i915_sw_fence_await(struct i915_sw_fence *fence)
+void i915_sw_fence_await(struct i915_sw_fence *fence)
 {
 	debug_fence_assert(fence);
 	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..6dec9e1d1102 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    unsigned long timeout,
 				    gfp_t gfp);
 
+void i915_sw_fence_await(struct i915_sw_fence *fence);
+void i915_sw_fence_complete(struct i915_sw_fence *fence);
+
 static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
 {
 	return atomic_read(&fence->pending) <= 0;
diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
index b96a31bc1080..0efaadd3bc32 100644
--- a/drivers/gpu/drm/i915/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
@@ -106,7 +106,12 @@
 #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
 #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
 #define   MI_SEMAPHORE_POLL		(1<<15)
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
 #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
 #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
 #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
 #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e97ce54138d3..80d17b75b2e6 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -421,7 +421,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	 * in the priority queue, but they will not gain immediate access to
 	 * the GPU.
 	 */
-	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
+	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
+	    i915_request_started(active)) {
 		prio |= ACTIVE_PRIORITY;
 		active->sched.attr.priority = prio;
 		list_move_tail(&active->sched.link,
@@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
 {
 	GEM_BUG_ON(!assert_priority_queue(prev, next));
 
+	/*
+	 * To avoid AB-BA deadlocks, we simply restrict ourselves to only
+	 * submitting one semaphore (think HW spinlock) to HW at a time. This
+	 * prevents the execution callback on a later semaphore from being
+	 * queued on another engine, so no cycle can be formed. Preemption
+	 * rules should mean that if this semaphore is preempted, its
+	 * dependency chain is preserved and suitably promoted via PI.
+	 */
+	if (prev->sched.semaphore && !i915_request_started(prev))
+		return false;
+
 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
 		return false;
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* [PATCH 11/11] drm/i915: Prioritise non-busywait semaphore workloads
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (8 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ Chris Wilson
@ 2019-01-30  2:19 ` Chris Wilson
  2019-01-30  2:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  2:19 UTC (permalink / raw)
  To: intel-gfx

We don't want to busywait on the GPU if we have other work to do. If we
give non-busywaiting workloads higher (initial) priority than workloads
that require a busywait, we will prioritise work that is ready to run
immediately.
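
As a sketch (using the simplified constants from the hunks below, not
the full priority computation), the extra bit below the user-priority
shift makes two requests at the same user level sort with the
non-busywaiting one ahead:

    #include <stdio.h>

    #define I915_USER_PRIORITY_SHIFT        3
    #define I915_PRIORITY_NOSEMAPHORE       (1 << 2)

    int main(void)
    {
            int user = 0;   /* both requests from a default-priority context */
            int busywait = user << I915_USER_PRIORITY_SHIFT;
            int no_wait = (user << I915_USER_PRIORITY_SHIFT) |
                          I915_PRIORITY_NOSEMAPHORE;

            printf("runs first: %s\n",
                   no_wait > busywait ? "non-busywait request" : "busywait request");
            return 0;
    }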

Testcase: igt/gem_exec_schedule/semaphore
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c   |  8 +++++++-
 drivers/gpu/drm/i915/i915_scheduler.c |  2 +-
 drivers/gpu/drm/i915/i915_scheduler.h | 11 +++++++----
 drivers/gpu/drm/i915/intel_lrc.c      |  5 +++--
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 6d825cd28ae6..30cd8724d39f 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -844,7 +844,7 @@ emit_semaphore_wait(struct i915_request *to,
 		intel_ring_advance(to, cs);
 	}
 
-	to->sched.semaphore = true;
+	to->sched.semaphore |= I915_SCHED_HAS_SEMAPHORE;
 	return 0;
 }
 
@@ -867,6 +867,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
 			return ret;
 	}
 
+	if (from->sched.semaphore && !i915_request_started(from))
+		to->sched.semaphore |= I915_SCHED_CHAIN_SEMAPHORE;
+
 	if (to->engine == from->engine) {
 		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
 						       &from->submit,
@@ -1117,6 +1120,9 @@ void i915_request_add(struct i915_request *request)
 	if (engine->schedule) {
 		struct i915_sched_attr attr = request->gem_context->sched;
 
+		if (!request->sched.semaphore)
+			attr.priority |= I915_PRIORITY_NOSEMAPHORE;
+
 		/*
 		 * Boost priorities to new clients (new request flows).
 		 *
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index aa6c663dca09..4394763d10d8 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -29,7 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node)
 	INIT_LIST_HEAD(&node->waiters_list);
 	INIT_LIST_HEAD(&node->link);
 	node->attr.priority = I915_PRIORITY_INVALID;
-	node->semaphore = false;
+	node->semaphore = 0;
 }
 
 static struct i915_dependency *
diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
index d764cf10536f..d84f09e8c248 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.h
+++ b/drivers/gpu/drm/i915/i915_scheduler.h
@@ -24,14 +24,15 @@ enum {
 	I915_PRIORITY_INVALID = INT_MIN
 };
 
-#define I915_USER_PRIORITY_SHIFT 2
+#define I915_USER_PRIORITY_SHIFT 3
 #define I915_USER_PRIORITY(x) ((x) << I915_USER_PRIORITY_SHIFT)
 
 #define I915_PRIORITY_COUNT BIT(I915_USER_PRIORITY_SHIFT)
 #define I915_PRIORITY_MASK (I915_PRIORITY_COUNT - 1)
 
-#define I915_PRIORITY_WAIT	((u8)BIT(0))
-#define I915_PRIORITY_NEWCLIENT	((u8)BIT(1))
+#define I915_PRIORITY_WAIT		((u8)BIT(0))
+#define I915_PRIORITY_NEWCLIENT		((u8)BIT(1))
+#define I915_PRIORITY_NOSEMAPHORE	((u8)BIT(2))
 
 struct i915_sched_attr {
 	/**
@@ -72,7 +73,9 @@ struct i915_sched_node {
 	struct list_head waiters_list; /* those after us, they depend upon us */
 	struct list_head link;
 	struct i915_sched_attr attr;
-	bool semaphore;
+	unsigned long semaphore;
+#define I915_SCHED_HAS_SEMAPHORE	BIT(0)
+#define I915_SCHED_CHAIN_SEMAPHORE	BIT(1)
 };
 
 struct i915_dependency {
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 80d17b75b2e6..5bb3964bb202 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -164,7 +164,7 @@
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
-#define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT)
+#define ACTIVE_PRIORITY (I915_PRIORITY_NEWCLIENT | I915_PRIORITY_NOSEMAPHORE)
 
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine,
@@ -614,7 +614,8 @@ static bool can_merge_rq(const struct i915_request *prev,
 	 * rules should mean that if this semaphore is preempted, its
 	 * dependency chain is preserved and suitably promoted via PI.
 	 */
-	if (prev->sched.semaphore && !i915_request_started(prev))
+	if (prev->sched.semaphore & I915_SCHED_HAS_SEMAPHORE &&
+	    !i915_request_started(prev))
 		return false;
 
 	if (!can_merge_ctx(prev->hw_context, next->hw_context))
-- 
2.20.1


* ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (9 preceding siblings ...)
  2019-01-30  2:19 ` [PATCH 11/11] drm/i915: Prioritise non-busywait semaphore workloads Chris Wilson
@ 2019-01-30  2:43 ` Patchwork
  2019-01-30  2:48 ` ✗ Fi.CI.SPARSE: " Patchwork
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Patchwork @ 2019-01-30  2:43 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/55956/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
45b1ecb63e1a drm/i915: Revoke mmaps and prevent access to fence registers across reset
2c67b547d327 drm/i915/execlists: Suppress redundant preemption
70551aa2b8d3 drm/i915/selftests: Exercise some AB...BA preemption chains
298a346378c9 drm/i915: Generalise GPU activity tracking
-:31: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#31: 
new file mode 100644

-:36: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#36: FILE: drivers/gpu/drm/i915/i915_active.c:1:
+/*

-:268: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#268: FILE: drivers/gpu/drm/i915/i915_active.h:1:
+/*

-:340: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#340: FILE: drivers/gpu/drm/i915/i915_active_types.h:1:
+/*

-:695: WARNING:SPDX_LICENSE_TAG: Missing or malformed SPDX-License-Identifier tag in line 1
#695: FILE: drivers/gpu/drm/i915/selftests/i915_active.c:1:
+/*

total: 0 errors, 5 warnings, 0 checks, 793 lines checked
31ec4b8fd5cb drm/i915: Add timeline barrier support
c1155360bb39 drm/i915: Allocate active tracking nodes from a slabcache
2dc94f068a5e drm/i915: Pull i915_gem_active into the i915_active family
-:699: CHECK:PARENTHESIS_ALIGNMENT: Alignment should match open parenthesis
#699: FILE: drivers/gpu/drm/i915/i915_gem_fence_reg.c:227:
+		ret = i915_active_request_retire(&vma->last_fence,
 					     &vma->obj->base.dev->struct_mutex);

-:708: CHECK:PARENTHESIS_ALIGNMENT: Alignment should match open parenthesis
#708: FILE: drivers/gpu/drm/i915/i915_gem_fence_reg.c:236:
+		ret = i915_active_request_retire(&old->last_fence,
 					     &old->obj->base.dev->struct_mutex);

-:1415: CHECK:PARENTHESIS_ALIGNMENT: Alignment should match open parenthesis
#1415: FILE: drivers/gpu/drm/i915/i915_vma.c:991:
+		ret = i915_active_request_retire(&vma->last_fence,
+					      &vma->vm->i915->drm.struct_mutex);

total: 0 errors, 0 warnings, 3 checks, 1398 lines checked
69af025cbfc5 drm/i915: Keep timeline HWSP allocated until the system is idle
ad99d8227d36 drm/i915/execlists: Refactor out can_merge_rq()
136b83dc2f9c drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
-:310: CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#310: FILE: drivers/gpu/drm/i915/intel_gpu_commands.h:109:
+#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
                                  	  ^

-:312: CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#312: FILE: drivers/gpu/drm/i915/intel_gpu_commands.h:111:
+#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
                                  	  ^

-:313: CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#313: FILE: drivers/gpu/drm/i915/intel_gpu_commands.h:112:
+#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
                                   	  ^

-:314: CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#314: FILE: drivers/gpu/drm/i915/intel_gpu_commands.h:113:
+#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
                                  	  ^

-:315: CHECK:SPACING: spaces preferred around that '<<' (ctx:VxV)
#315: FILE: drivers/gpu/drm/i915/intel_gpu_commands.h:114:
+#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
                                   	  ^

total: 0 errors, 0 warnings, 5 checks, 274 lines checked
3a880e78d537 drm/i915: Prioritise non-busywait semaphore workloads


* ✗ Fi.CI.SPARSE: warning for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (10 preceding siblings ...)
  2019-01-30  2:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
@ 2019-01-30  2:48 ` Patchwork
  2019-01-30  3:07 ` ✓ Fi.CI.BAT: success " Patchwork
                   ` (2 subsequent siblings)
  14 siblings, 0 replies; 38+ messages in thread
From: Patchwork @ 2019-01-30  2:48 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/55956/
State : warning

== Summary ==

$ dim sparse origin/drm-tip
Sparse version: v0.5.2
Commit: drm/i915: Revoke mmaps and prevent access to fence registers across reset
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
-drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_gem.c:986:39: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/i915_reset.c:1301:5: warning: context imbalance in 'i915_reset_lock' - different lock contexts for basic block
+drivers/gpu/drm/i915/i915_reset.c:1322:6: warning: context imbalance in 'i915_reset_unlock' - unexpected unlock
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3548:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3542:16: warning: expression using sizeof(void)

Commit: drm/i915/execlists: Suppress redundant preemption
Okay!

Commit: drm/i915/selftests: Exercise some AB...BA preemption chains
Okay!

Commit: drm/i915: Generalise GPU activity tracking
+./include/uapi/linux/perf_event.h:147:56: warning: cast truncates bits from constant value (8000000000000000 becomes 0)

Commit: drm/i915: Add timeline barrier support
Okay!

Commit: drm/i915: Allocate active tracking nodes from a slabcache
-drivers/gpu/drm/i915/selftests/../i915_drv.h:3542:16: warning: expression using sizeof(void)
+drivers/gpu/drm/i915/selftests/../i915_drv.h:3544:16: warning: expression using sizeof(void)

Commit: drm/i915: Pull i915_gem_active into the i915_active family
Okay!

Commit: drm/i915: Keep timeline HWSP allocated until the system is idle
Okay!

Commit: drm/i915/execlists: Refactor out can_merge_rq()
Okay!

Commit: drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
Okay!

Commit: drm/i915: Prioritise non-busywait semaphore workloads
Okay!


* ✓ Fi.CI.BAT: success for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (11 preceding siblings ...)
  2019-01-30  2:48 ` ✗ Fi.CI.SPARSE: " Patchwork
@ 2019-01-30  3:07 ` Patchwork
  2019-01-30  7:32 ` ✗ Fi.CI.IGT: failure " Patchwork
  2019-01-30 18:41 ` ✗ Fi.CI.BAT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
  14 siblings, 0 replies; 38+ messages in thread
From: Patchwork @ 2019-01-30  3:07 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/55956/
State : success

== Summary ==

CI Bug Log - changes from CI_DRM_5506 -> Patchwork_12083
====================================================

Summary
-------

  **SUCCESS**

  No regressions found.

  External URL: https://patchwork.freedesktop.org/api/1.0/series/55956/revisions/1/mbox/

Known issues
------------

  Here are the changes found in Patchwork_12083 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@i915_selftest@live_execlists:
    - fi-apl-guc:         NOTRUN -> INCOMPLETE [fdo#103927]

  * igt@kms_busy@basic-flip-a:
    - fi-gdg-551:         PASS -> FAIL [fdo#103182]

  * igt@kms_chamelium@hdmi-hpd-fast:
    - fi-kbl-7500u:       PASS -> FAIL [fdo#109485]

  * igt@kms_flip@basic-flip-vs-modeset:
    - fi-skl-6700hq:      PASS -> DMESG-WARN [fdo#105998] +1

  * igt@pm_rpm@module-reload:
    - fi-skl-6770hq:      PASS -> DMESG-WARN [fdo#105541]

  
#### Possible fixes ####

  * igt@kms_chamelium@common-hpd-after-suspend:
    - fi-kbl-7567u:       WARN [fdo#109380] -> PASS

  * igt@kms_pipe_crc_basic@nonblocking-crc-pipe-c:
    - fi-kbl-7567u:       {SKIP} [fdo#109271] -> PASS +33

  * igt@kms_pipe_crc_basic@suspend-read-crc-pipe-c:
    - fi-hsw-4770:        {SKIP} [fdo#109271] -> PASS +3

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103182]: https://bugs.freedesktop.org/show_bug.cgi?id=103182
  [fdo#103927]: https://bugs.freedesktop.org/show_bug.cgi?id=103927
  [fdo#105541]: https://bugs.freedesktop.org/show_bug.cgi?id=105541
  [fdo#105998]: https://bugs.freedesktop.org/show_bug.cgi?id=105998
  [fdo#108622]: https://bugs.freedesktop.org/show_bug.cgi?id=108622
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109380]: https://bugs.freedesktop.org/show_bug.cgi?id=109380
  [fdo#109485]: https://bugs.freedesktop.org/show_bug.cgi?id=109485


Participating hosts (44 -> 41)
------------------------------

  Additional (2): fi-byt-j1900 fi-apl-guc 
  Missing    (5): fi-kbl-soraka fi-ilk-m540 fi-byt-squawks fi-bsw-cyan fi-bdw-samus 


Build changes
-------------

    * Linux: CI_DRM_5506 -> Patchwork_12083

  CI_DRM_5506: 864d26b6a3e3b28515324f330b2240391f86f458 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4798: 998e0a4aedf10fb5f7c271018cd80d874668bf55 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12083: 3a880e78d53731890ab3f0d64afef8c217a8c52c @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

3a880e78d537 drm/i915: Prioritise non-busywait semaphore workloads
136b83dc2f9c drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
ad99d8227d36 drm/i915/execlists: Refactor out can_merge_rq()
69af025cbfc5 drm/i915: Keep timeline HWSP allocated until the system is idle
2dc94f068a5e drm/i915: Pull i915_gem_active into the i915_active family
c1155360bb39 drm/i915: Allocate active tracking nodes from a slabcache
31ec4b8fd5cb drm/i915: Add timeline barrier support
298a346378c9 drm/i915: Generalise GPU activity tracking
70551aa2b8d3 drm/i915/selftests: Exercise some AB...BA preemption chains
2c67b547d327 drm/i915/execlists: Suppress redundant preemption
45b1ecb63e1a drm/i915: Revoke mmaps and prevent access to fence registers across reset

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12083/

* ✗ Fi.CI.IGT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (12 preceding siblings ...)
  2019-01-30  3:07 ` ✓ Fi.CI.BAT: success " Patchwork
@ 2019-01-30  7:32 ` Patchwork
  2019-01-30  9:17   ` Chris Wilson
  2019-01-30 18:41 ` ✗ Fi.CI.BAT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
  14 siblings, 1 reply; 38+ messages in thread
From: Patchwork @ 2019-01-30  7:32 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
URL   : https://patchwork.freedesktop.org/series/55956/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_5506_full -> Patchwork_12083_full
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_12083_full absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_12083_full, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_12083_full:

### IGT changes ###

#### Possible regressions ####

  * igt@gem_eio@unwedge-stress:
    - shard-glk:          PASS -> FAIL +6
    - shard-snb:          PASS -> FAIL +1
    - shard-apl:          PASS -> FAIL +1

  * igt@gem_eio@wait-wedge-1us:
    - shard-kbl:          PASS -> FAIL +1

  
Known issues
------------

  Here are the changes found in Patchwork_12083_full that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_eio@in-flight-contexts-10ms:
    - shard-snb:          PASS -> FAIL [fdo#107799] +6

  * igt@kms_cursor_crc@cursor-256x256-random:
    - shard-glk:          PASS -> FAIL [fdo#103232] +1

  * igt@kms_cursor_crc@cursor-256x85-random:
    - shard-apl:          PASS -> FAIL [fdo#103232] +4

  * igt@kms_flip@flip-vs-expired-vblank:
    - shard-glk:          PASS -> FAIL [fdo#105363]

  * igt@kms_plane@plane-position-covered-pipe-c-planes:
    - shard-apl:          PASS -> FAIL [fdo#103166]

  * igt@kms_plane_multiple@atomic-pipe-c-tiling-x:
    - shard-glk:          PASS -> FAIL [fdo#103166]

  * igt@kms_setmode@basic:
    - shard-kbl:          PASS -> FAIL [fdo#99912]

  * igt@perf_pmu@rc6-runtime-pm-long:
    - shard-kbl:          PASS -> FAIL [fdo#105010]

  
#### Possible fixes ####

  * igt@gem_mmap_gtt@hang:
    - shard-kbl:          FAIL [fdo#109469] -> PASS
    - shard-hsw:          FAIL [fdo#109469] -> PASS
    - shard-snb:          FAIL [fdo#109469] -> PASS
    - shard-glk:          FAIL [fdo#109469] -> PASS
    - shard-apl:          FAIL [fdo#109469] -> PASS

  * igt@kms_cursor_crc@cursor-256x256-suspend:
    - shard-apl:          FAIL [fdo#103191] / [fdo#103232] -> PASS

  * igt@kms_cursor_crc@cursor-64x21-onscreen:
    - shard-glk:          FAIL [fdo#103232] -> PASS

  * igt@kms_cursor_crc@cursor-64x64-dpms:
    - shard-apl:          FAIL [fdo#103232] -> PASS

  * igt@kms_cursor_crc@cursor-64x64-suspend:
    - shard-snb:          INCOMPLETE [fdo#105411] -> PASS

  * igt@kms_plane@pixel-format-pipe-c-planes-source-clamping:
    - shard-apl:          FAIL [fdo#108948] -> PASS

  * igt@kms_plane_multiple@atomic-pipe-b-tiling-none:
    - shard-glk:          FAIL [fdo#103166] -> PASS +1
    - shard-apl:          FAIL [fdo#103166] -> PASS +1

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [fdo#103166]: https://bugs.freedesktop.org/show_bug.cgi?id=103166
  [fdo#103191]: https://bugs.freedesktop.org/show_bug.cgi?id=103191
  [fdo#103232]: https://bugs.freedesktop.org/show_bug.cgi?id=103232
  [fdo#105010]: https://bugs.freedesktop.org/show_bug.cgi?id=105010
  [fdo#105363]: https://bugs.freedesktop.org/show_bug.cgi?id=105363
  [fdo#105411]: https://bugs.freedesktop.org/show_bug.cgi?id=105411
  [fdo#107799]: https://bugs.freedesktop.org/show_bug.cgi?id=107799
  [fdo#108948]: https://bugs.freedesktop.org/show_bug.cgi?id=108948
  [fdo#109271]: https://bugs.freedesktop.org/show_bug.cgi?id=109271
  [fdo#109278]: https://bugs.freedesktop.org/show_bug.cgi?id=109278
  [fdo#109469]: https://bugs.freedesktop.org/show_bug.cgi?id=109469
  [fdo#99912]: https://bugs.freedesktop.org/show_bug.cgi?id=99912


Participating hosts (7 -> 5)
------------------------------

  Missing    (2): shard-skl shard-iclb 


Build changes
-------------

    * Linux: CI_DRM_5506 -> Patchwork_12083

  CI_DRM_5506: 864d26b6a3e3b28515324f330b2240391f86f458 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_4798: 998e0a4aedf10fb5f7c271018cd80d874668bf55 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_12083: 3a880e78d53731890ab3f0d64afef8c217a8c52c @ git://anongit.freedesktop.org/gfx-ci/linux
  piglit_4509: fdc5a4ca11124ab8413c7988896eec4c97336694 @ git://anongit.freedesktop.org/piglit

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_12083/

* Re: ✗ Fi.CI.IGT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset
  2019-01-30  7:32 ` ✗ Fi.CI.IGT: failure " Patchwork
@ 2019-01-30  9:17   ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30  9:17 UTC (permalink / raw)
  To: Patchwork; +Cc: intel-gfx

Quoting Patchwork (2019-01-30 07:32:25)
> #### Possible fixes ####
> 
>   * igt@gem_mmap_gtt@hang:
>     - shard-kbl:          FAIL [fdo#109469] -> PASS
>     - shard-hsw:          FAIL [fdo#109469] -> PASS
>     - shard-snb:          FAIL [fdo#109469] -> PASS
>     - shard-glk:          FAIL [fdo#109469] -> PASS
>     - shard-apl:          FAIL [fdo#109469] -> PASS

Good, that was the intent...

> #### Possible regressions ####
> 
>   * igt@gem_eio@unwedge-stress:
>     - shard-glk:          PASS -> FAIL +6
>     - shard-snb:          PASS -> FAIL +1
>     - shard-apl:          PASS -> FAIL +1
> 
>   * igt@gem_eio@wait-wedge-1us:
>     - shard-kbl:          PASS -> FAIL +1

Hmm. A mixture of too slow, and a missed reset???
-Chris

* Re: [PATCH 04/11] drm/i915: Generalise GPU activity tracking
  2019-01-30  2:18 ` [PATCH 04/11] drm/i915: Generalise GPU activity tracking Chris Wilson
@ 2019-01-30 12:02   ` Tvrtko Ursulin
  2019-01-30 12:14     ` Chris Wilson
  0 siblings, 1 reply; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-30 12:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:18, Chris Wilson wrote:
> We currently track GPU memory usage inside VMA, such that we never
> release memory used by the GPU until after it has finished accessing it.
> However, we may want to track other resources aside from VMA, or we may
> want to split a VMA into multiple independent regions and track each
> separately. For this purpose, generalise our request tracking (akin to
> struct reservation_object) so that we can embed it into other objects.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile                 |   4 +-
>   drivers/gpu/drm/i915/i915_active.c            | 226 ++++++++++++++++++
>   drivers/gpu/drm/i915/i915_active.h            |  66 +++++
>   drivers/gpu/drm/i915/i915_active_types.h      |  26 ++
>   drivers/gpu/drm/i915/i915_gem_gtt.c           |   3 +-
>   drivers/gpu/drm/i915/i915_vma.c               | 173 +++-----------
>   drivers/gpu/drm/i915/i915_vma.h               |   9 +-
>   drivers/gpu/drm/i915/selftests/i915_active.c  | 158 ++++++++++++
>   .../drm/i915/selftests/i915_live_selftests.h  |   3 +-
>   9 files changed, 514 insertions(+), 154 deletions(-)
>   create mode 100644 drivers/gpu/drm/i915/i915_active.c
>   create mode 100644 drivers/gpu/drm/i915/i915_active.h
>   create mode 100644 drivers/gpu/drm/i915/i915_active_types.h
>   create mode 100644 drivers/gpu/drm/i915/selftests/i915_active.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 210d0e8777b6..1787e1299b1b 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -57,7 +57,9 @@ i915-$(CONFIG_DEBUG_FS) += i915_debugfs.o intel_pipe_crc.o
>   i915-$(CONFIG_PERF_EVENTS) += i915_pmu.o
>   
>   # GEM code
> -i915-y += i915_cmd_parser.o \
> +i915-y += \
> +	  i915_active.o \
> +	  i915_cmd_parser.o \
>   	  i915_gem_batch_pool.o \
>   	  i915_gem_clflush.o \
>   	  i915_gem_context.o \
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> new file mode 100644
> index 000000000000..e0182e19cb8b
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -0,0 +1,226 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#include "i915_drv.h"
> +#include "i915_active.h"
> +
> +#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
> +
> +struct active_node {
> +	struct i915_gem_active base;
> +	struct i915_active *ref;
> +	struct rb_node node;
> +	u64 timeline;
> +};
> +
> +static void
> +__active_retire(struct i915_active *ref)
> +{

You wouldn't consider naming this variable 'active' throughout? 'ref' 
reminds me so much of a kref, e.g. fence->refcount. Although 'active' has 
been used for gem_active so far. Not a blocker though, I'll get used to it.

> +	GEM_BUG_ON(!ref->count);
> +	if (!--ref->count)
> +		ref->retire(ref);
> +}
> +
> +static void
> +node_retire(struct i915_gem_active *base, struct i915_request *rq)
> +{
> +	__active_retire(container_of(base, struct active_node, base)->ref);
> +}
> +
> +static void
> +last_retire(struct i915_gem_active *base, struct i915_request *rq)
> +{
> +	__active_retire(container_of(base, struct i915_active, last));
> +}
> +
> +static struct i915_gem_active *
> +active_instance(struct i915_active *ref, u64 idx)
> +{
> +	struct active_node *node;
> +	struct rb_node **p, *parent;
> +	struct i915_request *old;
> +
> +	/*
> +	 * We track the most recently used timeline to skip a rbtree search
> +	 * for the common case, under typical loads we never need the rbtree
> +	 * at all. We can reuse the last slot if it is empty, that is
> +	 * after the previous activity has been retired, or if it matches the
> +	 * current timeline.
> +	 *
> +	 * Note that we allow the timeline to be active simultaneously in
> +	 * the rbtree and the last cache. We do this to avoid having
> +	 * to search and replace the rbtree element for a new timeline, with
> +	 * the cost being that we must be aware that the ref may be retired
> +	 * twice for the same timeline (as the older rbtree element will be
> +	 * retired before the new request added to last).
> +	 */
> +	old = i915_gem_active_raw(&ref->last, BKL(ref));
> +	if (!old || old->fence.context == idx)
> +		goto out;
> +
> +	/* Move the currently active fence into the rbtree */
> +	idx = old->fence.context;
> +
> +	parent = NULL;
> +	p = &ref->tree.rb_node;
> +	while (*p) {
> +		parent = *p;
> +
> +		node = rb_entry(parent, struct active_node, node);
> +		if (node->timeline == idx)
> +			goto replace;
> +
> +		if (node->timeline < idx)
> +			p = &parent->rb_right;
> +		else
> +			p = &parent->rb_left;
> +	}
> +
> +	node = kmalloc(sizeof(*node), GFP_KERNEL);
> +
> +	/* kmalloc may retire the ref->last (thanks shrinker)! */
> +	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
> +		kfree(node);
> +		goto out;
> +	}
> +
> +	if (unlikely(!node))
> +		return ERR_PTR(-ENOMEM);
> +
> +	init_request_active(&node->base, node_retire);
> +	node->ref = ref;
> +	node->timeline = idx;
> +
> +	rb_link_node(&node->node, parent, p);
> +	rb_insert_color(&node->node, &ref->tree);
> +
> +replace:
> +	/*
> +	 * Overwrite the previous active slot in the rbtree with last,
> +	 * leaving last zeroed. If the previous slot is still active,
> +	 * we must be careful as we now only expect to receive one retire
> +	 * callback not two, and so must undo the active counting for the
> +	 * overwritten slot.
> +	 */
> +	if (i915_gem_active_isset(&node->base)) {
> +		/* Retire ourselves from the old rq->active_list */
> +		__list_del_entry(&node->base.link);
> +		ref->count--;
> +		GEM_BUG_ON(!ref->count);
> +	}
> +	GEM_BUG_ON(list_empty(&ref->last.link));
> +	list_replace_init(&ref->last.link, &node->base.link);
> +	node->base.request = fetch_and_zero(&ref->last.request);
> +
> +out:
> +	return &ref->last;
> +}
> +
> +void i915_active_init(struct drm_i915_private *i915,
> +		      struct i915_active *ref,
> +		      void (*retire)(struct i915_active *ref))
> +{
> +	ref->i915 = i915;
> +	ref->retire = retire;
> +	ref->tree = RB_ROOT;
> +	init_request_active(&ref->last, last_retire);
> +	ref->count = 0;
> +}
> +
> +int i915_active_ref(struct i915_active *ref,
> +		    u64 timeline,
> +		    struct i915_request *rq)
> +{
> +	struct i915_gem_active *active;
> +
> +	active = active_instance(ref, timeline);
> +	if (IS_ERR(active))
> +		return PTR_ERR(active);
> +
> +	if (!i915_gem_active_isset(active))
> +		ref->count++;

Could stick a super-paranoid overflow GEM_BUG_ON here.

> +	i915_gem_active_set(active, rq);
> +
> +	return 0;
> +}
> +
> +bool i915_active_acquire(struct i915_active *ref)
> +{
> +	lockdep_assert_held(BKL(ref));
> +	return !ref->count++;
> +}
> +
> +void i915_active_release(struct i915_active *ref)
> +{
> +	lockdep_assert_held(BKL(ref));
> +	__active_retire(ref);
> +}
> +
> +int i915_active_wait(struct i915_active *ref)
> +{
> +	struct active_node *it, *n;
> +	int ret;
> +
> +	ret = i915_gem_active_retire(&ref->last, BKL(ref));
> +	if (ret)
> +		return ret;
> +
> +	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> +		ret = i915_gem_active_retire(&it->base, BKL(ref));
> +		if (ret)
> +			return ret;
> +
> +		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> +		kfree(it);
> +	}
> +	ref->tree = RB_ROOT;
> +
> +	return 0;
> +}
> +
> +static int __i915_request_await_active(struct i915_request *rq,
> +				       struct i915_gem_active *active)
> +{
> +	struct i915_request *barrier =
> +		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
> +
> +	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
> +}
> +
> +int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
> +{
> +	struct active_node *it, *n;
> +	int ret;
> +
> +	ret = __i915_request_await_active(rq, &ref->last);
> +	if (ret)
> +		return ret;
> +
> +	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> +		ret = __i915_request_await_active(rq, &it->base);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +void i915_active_fini(struct i915_active *ref)
> +{
> +	struct active_node *it, *n;
> +
> +	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
> +
> +	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> +		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> +		kfree(it);
> +	}
> +	ref->tree = RB_ROOT;
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftests/i915_active.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> new file mode 100644
> index 000000000000..c0729a046f98
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -0,0 +1,66 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef _I915_ACTIVE_H_
> +#define _I915_ACTIVE_H_
> +
> +#include "i915_active_types.h"
> +
> +#include <linux/rbtree.h>
> +
> +#include "i915_request.h"
> +
> +/*
> + * GPU activity tracking
> + *
> + * Each set of commands submitted to the GPU comprises a single request that
> + * signals a fence upon completion. struct i915_request combines the
> + * command submission, scheduling and fence signaling roles. If we want to see
> + * if a particular task is complete, we need to grab the fence (struct
> + * i915_request) for that task and check or wait for it to be signaled. More
> + * often though we want to track the status of a bunch of tasks, for example
> + * to wait for the GPU to finish accessing some memory across a variety of
> + * different command pipelines from different clients. We could choose to
> + * track every single request associated with the task, but knowing that
> + * each request belongs to an ordered timeline (later requests within a
> + * timeline must wait for earlier requests), we need only track the
> + * latest request in each timeline to determine the overall status of the
> + * task.
> + *
> + * struct i915_active provides this tracking across timelines. It builds a
> + * composite shared-fence, and is updated as new work is submitted to the task,
> + * forming a snapshot of the current status. It should be embedded into the
> + * different resources that need to track their associated GPU activity to
> + * provide a callback when that GPU activity has ceased, or otherwise to
> + * provide a serialisation point either for request submission or for CPU
> + * synchronisation.
> + */
> +
> +void i915_active_init(struct drm_i915_private *i915,
> +		      struct i915_active *ref,
> +		      void (*retire)(struct i915_active *ref));
> +
> +int i915_active_ref(struct i915_active *ref,
> +		    u64 timeline,
> +		    struct i915_request *rq);
> +
> +int i915_active_wait(struct i915_active *ref);
> +
> +int i915_request_await_active(struct i915_request *rq,
> +			      struct i915_active *ref);
> +
> +bool i915_active_acquire(struct i915_active *ref);
> +void i915_active_release(struct i915_active *ref);
> +
> +static inline bool
> +i915_active_is_idle(const struct i915_active *ref)
> +{
> +	return !ref->count;
> +}
> +
> +void i915_active_fini(struct i915_active *ref);
> +
> +#endif /* _I915_ACTIVE_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> new file mode 100644
> index 000000000000..411e502ed8dd
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -0,0 +1,26 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2019 Intel Corporation
> + */
> +
> +#ifndef _I915_ACTIVE_TYPES_H_
> +#define _I915_ACTIVE_TYPES_H_
> +
> +#include <linux/rbtree.h>
> +
> +#include "i915_request.h"
> +
> +struct drm_i915_private;
> +
> +struct i915_active {
> +	struct drm_i915_private *i915;
> +
> +	struct rb_root tree;
> +	struct i915_gem_active last;
> +	unsigned int count;
> +
> +	void (*retire)(struct i915_active *ref);
> +};
> +
> +#endif /* _I915_ACTIVE_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 49b00996a15e..e625659c03a2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1917,14 +1917,13 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
>   	if (!vma)
>   		return ERR_PTR(-ENOMEM);
>   
> +	i915_active_init(i915, &vma->active, NULL);
>   	init_request_active(&vma->last_fence, NULL);
>   
>   	vma->vm = &ggtt->vm;
>   	vma->ops = &pd_vma_ops;
>   	vma->private = ppgtt;
>   
> -	vma->active = RB_ROOT;
> -
>   	vma->size = size;
>   	vma->fence_size = size;
>   	vma->flags = I915_VMA_GGTT;
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index d83b8ad5f859..d4772061e642 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -63,22 +63,23 @@ static void vma_print_allocator(struct i915_vma *vma, const char *reason)
>   
>   #endif
>   
> -struct i915_vma_active {
> -	struct i915_gem_active base;
> -	struct i915_vma *vma;
> -	struct rb_node node;
> -	u64 timeline;
> -};
> +static void obj_bump_mru(struct drm_i915_gem_object *obj)
> +{
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
>   
> -static void
> -__i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
> +	spin_lock(&i915->mm.obj_lock);
> +	if (obj->bind_count)
> +		list_move_tail(&obj->mm.link, &i915->mm.bound_list);
> +	spin_unlock(&i915->mm.obj_lock);
> +
> +	obj->mm.dirty = true; /* be paranoid  */
> +}
> +
> +static void __i915_vma_retire(struct i915_active *ref)
>   {
> +	struct i915_vma *vma = container_of(ref, typeof(*vma), active);
>   	struct drm_i915_gem_object *obj = vma->obj;
>   
> -	GEM_BUG_ON(!i915_vma_is_active(vma));
> -	if (--vma->active_count)
> -		return;
> -
>   	GEM_BUG_ON(!i915_gem_object_is_active(obj));
>   	if (--obj->active_count)
>   		return;
> @@ -90,16 +91,12 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
>   		reservation_object_unlock(obj->resv);
>   	}
>   
> -	/* Bump our place on the bound list to keep it roughly in LRU order
> +	/*
> +	 * Bump our place on the bound list to keep it roughly in LRU order
>   	 * so that we don't steal from recently used but inactive objects
>   	 * (unless we are forced to ofc!)
>   	 */
> -	spin_lock(&rq->i915->mm.obj_lock);
> -	if (obj->bind_count)
> -		list_move_tail(&obj->mm.link, &rq->i915->mm.bound_list);
> -	spin_unlock(&rq->i915->mm.obj_lock);
> -
> -	obj->mm.dirty = true; /* be paranoid  */
> +	obj_bump_mru(obj);
>   
>   	if (i915_gem_object_has_active_reference(obj)) {
>   		i915_gem_object_clear_active_reference(obj);
> @@ -107,21 +104,6 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
>   	}
>   }
>   
> -static void
> -i915_vma_retire(struct i915_gem_active *base, struct i915_request *rq)
> -{
> -	struct i915_vma_active *active =
> -		container_of(base, typeof(*active), base);
> -
> -	__i915_vma_retire(active->vma, rq);
> -}
> -
> -static void
> -i915_vma_last_retire(struct i915_gem_active *base, struct i915_request *rq)
> -{
> -	__i915_vma_retire(container_of(base, struct i915_vma, last_active), rq);
> -}
> -
>   static struct i915_vma *
>   vma_create(struct drm_i915_gem_object *obj,
>   	   struct i915_address_space *vm,
> @@ -137,10 +119,9 @@ vma_create(struct drm_i915_gem_object *obj,
>   	if (vma == NULL)
>   		return ERR_PTR(-ENOMEM);
>   
> -	vma->active = RB_ROOT;
> -
> -	init_request_active(&vma->last_active, i915_vma_last_retire);
> +	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
>   	init_request_active(&vma->last_fence, NULL);
> +
>   	vma->vm = vm;
>   	vma->ops = &vm->vma_ops;
>   	vma->obj = obj;
> @@ -823,7 +804,6 @@ void i915_vma_reopen(struct i915_vma *vma)
>   static void __i915_vma_destroy(struct i915_vma *vma)
>   {
>   	struct drm_i915_private *i915 = vma->vm->i915;
> -	struct i915_vma_active *iter, *n;
>   
>   	GEM_BUG_ON(vma->node.allocated);
>   	GEM_BUG_ON(vma->fence);
> @@ -843,10 +823,7 @@ static void __i915_vma_destroy(struct i915_vma *vma)
>   		spin_unlock(&obj->vma.lock);
>   	}
>   
> -	rbtree_postorder_for_each_entry_safe(iter, n, &vma->active, node) {
> -		GEM_BUG_ON(i915_gem_active_isset(&iter->base));
> -		kfree(iter);
> -	}
> +	i915_active_fini(&vma->active);
>   
>   	kmem_cache_free(i915->vmas, vma);
>   }
> @@ -931,104 +908,15 @@ static void export_fence(struct i915_vma *vma,
>   	reservation_object_unlock(resv);
>   }
>   
> -static struct i915_gem_active *active_instance(struct i915_vma *vma, u64 idx)
> -{
> -	struct i915_vma_active *active;
> -	struct rb_node **p, *parent;
> -	struct i915_request *old;
> -
> -	/*
> -	 * We track the most recently used timeline to skip a rbtree search
> -	 * for the common case, under typical loads we never need the rbtree
> -	 * at all. We can reuse the last_active slot if it is empty, that is
> -	 * after the previous activity has been retired, or if the active
> -	 * matches the current timeline.
> -	 *
> -	 * Note that we allow the timeline to be active simultaneously in
> -	 * the rbtree and the last_active cache. We do this to avoid having
> -	 * to search and replace the rbtree element for a new timeline, with
> -	 * the cost being that we must be aware that the vma may be retired
> -	 * twice for the same timeline (as the older rbtree element will be
> -	 * retired before the new request added to last_active).
> -	 */
> -	old = i915_gem_active_raw(&vma->last_active,
> -				  &vma->vm->i915->drm.struct_mutex);
> -	if (!old || old->fence.context == idx)
> -		goto out;
> -
> -	/* Move the currently active fence into the rbtree */
> -	idx = old->fence.context;
> -
> -	parent = NULL;
> -	p = &vma->active.rb_node;
> -	while (*p) {
> -		parent = *p;
> -
> -		active = rb_entry(parent, struct i915_vma_active, node);
> -		if (active->timeline == idx)
> -			goto replace;
> -
> -		if (active->timeline < idx)
> -			p = &parent->rb_right;
> -		else
> -			p = &parent->rb_left;
> -	}
> -
> -	active = kmalloc(sizeof(*active), GFP_KERNEL);
> -
> -	/* kmalloc may retire the vma->last_active request (thanks shrinker)! */
> -	if (unlikely(!i915_gem_active_raw(&vma->last_active,
> -					  &vma->vm->i915->drm.struct_mutex))) {
> -		kfree(active);
> -		goto out;
> -	}
> -
> -	if (unlikely(!active))
> -		return ERR_PTR(-ENOMEM);
> -
> -	init_request_active(&active->base, i915_vma_retire);
> -	active->vma = vma;
> -	active->timeline = idx;
> -
> -	rb_link_node(&active->node, parent, p);
> -	rb_insert_color(&active->node, &vma->active);
> -
> -replace:
> -	/*
> -	 * Overwrite the previous active slot in the rbtree with last_active,
> -	 * leaving last_active zeroed. If the previous slot is still active,
> -	 * we must be careful as we now only expect to receive one retire
> -	 * callback not two, and so much undo the active counting for the
> -	 * overwritten slot.
> -	 */
> -	if (i915_gem_active_isset(&active->base)) {
> -		/* Retire ourselves from the old rq->active_list */
> -		__list_del_entry(&active->base.link);
> -		vma->active_count--;
> -		GEM_BUG_ON(!vma->active_count);
> -	}
> -	GEM_BUG_ON(list_empty(&vma->last_active.link));
> -	list_replace_init(&vma->last_active.link, &active->base.link);
> -	active->base.request = fetch_and_zero(&vma->last_active.request);
> -
> -out:
> -	return &vma->last_active;
> -}
> -
>   int i915_vma_move_to_active(struct i915_vma *vma,
>   			    struct i915_request *rq,
>   			    unsigned int flags)
>   {
>   	struct drm_i915_gem_object *obj = vma->obj;
> -	struct i915_gem_active *active;
>   
>   	lockdep_assert_held(&rq->i915->drm.struct_mutex);
>   	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
>   
> -	active = active_instance(vma, rq->fence.context);
> -	if (IS_ERR(active))
> -		return PTR_ERR(active);
> -
>   	/*
>   	 * Add a reference if we're newly entering the active list.
>   	 * The order in which we add operations to the retirement queue is
> @@ -1037,9 +925,15 @@ int i915_vma_move_to_active(struct i915_vma *vma,
>   	 * add the active reference first and queue for it to be dropped
>   	 * *last*.
>   	 */
> -	if (!i915_gem_active_isset(active) && !vma->active_count++)
> +	if (!vma->active.count)
>   		obj->active_count++;
> -	i915_gem_active_set(active, rq);
> +
> +	if (unlikely(i915_active_ref(&vma->active, rq->fence.context, rq))) {
> +		if (!vma->active.count)
> +			obj->active_count--;
> +		return -ENOMEM;
> +	}

Optionally you could make i915_active_ref return the old ref count or 
error. Then this could become simpler:

	ret = i915_active_ref(..);
	if (unlikely(ret < 0))
		return -ENOMEM;
	else if (ret == 0)
		obj->active_count++;

> +
>   	GEM_BUG_ON(!i915_vma_is_active(vma));
>   	GEM_BUG_ON(!obj->active_count);
>   
> @@ -1073,8 +967,6 @@ int i915_vma_unbind(struct i915_vma *vma)
>   	 */
>   	might_sleep();
>   	if (i915_vma_is_active(vma)) {
> -		struct i915_vma_active *active, *n;
> -
>   		/*
>   		 * When a closed VMA is retired, it is unbound - eek.
>   		 * In order to prevent it from being recursively closed,
> @@ -1090,19 +982,10 @@ int i915_vma_unbind(struct i915_vma *vma)
>   		 */
>   		__i915_vma_pin(vma);
>   
> -		ret = i915_gem_active_retire(&vma->last_active,
> -					     &vma->vm->i915->drm.struct_mutex);
> +		ret = i915_active_wait(&vma->active);
>   		if (ret)
>   			goto unpin;
>   
> -		rbtree_postorder_for_each_entry_safe(active, n,
> -						     &vma->active, node) {
> -			ret = i915_gem_active_retire(&active->base,
> -						     &vma->vm->i915->drm.struct_mutex);
> -			if (ret)
> -				goto unpin;
> -		}
> -
>   		ret = i915_gem_active_retire(&vma->last_fence,
>   					     &vma->vm->i915->drm.struct_mutex);
>   unpin:
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 5793abe509a2..3c03d4569481 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -34,6 +34,7 @@
>   #include "i915_gem_fence_reg.h"
>   #include "i915_gem_object.h"
>   
> +#include "i915_active.h"
>   #include "i915_request.h"
>   
>   enum i915_cache_level;
> @@ -108,9 +109,7 @@ struct i915_vma {
>   #define I915_VMA_USERFAULT	BIT(I915_VMA_USERFAULT_BIT)
>   #define I915_VMA_GGTT_WRITE	BIT(15)
>   
> -	unsigned int active_count;
> -	struct rb_root active;
> -	struct i915_gem_active last_active;
> +	struct i915_active active;
>   	struct i915_gem_active last_fence;
>   
>   	/**
> @@ -154,9 +153,9 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
>   void i915_vma_unpin_and_release(struct i915_vma **p_vma, unsigned int flags);
>   #define I915_VMA_RELEASE_MAP BIT(0)
>   
> -static inline bool i915_vma_is_active(struct i915_vma *vma)
> +static inline bool i915_vma_is_active(const struct i915_vma *vma)
>   {
> -	return vma->active_count;
> +	return !i915_active_is_idle(&vma->active);
>   }
>   
>   int __must_check i915_vma_move_to_active(struct i915_vma *vma,
> diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
> new file mode 100644
> index 000000000000..7c5c3068565b
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/selftests/i915_active.c
> @@ -0,0 +1,158 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#include "../i915_selftest.h"
> +
> +#include "igt_flush_test.h"
> +#include "lib_sw_fence.h"
> +
> +struct live_active {
> +	struct i915_active base;
> +	bool retired;
> +};
> +
> +static void __live_active_retire(struct i915_active *base)
> +{
> +	struct live_active *active = container_of(base, typeof(*active), base);
> +
> +	active->retired = true;
> +}
> +
> +static int __live_active_setup(struct drm_i915_private *i915,
> +			       struct live_active *active)
> +{
> +	struct intel_engine_cs *engine;
> +	struct i915_sw_fence *submit;
> +	enum intel_engine_id id;
> +	unsigned int count = 0;
> +	int err = 0;
> +
> +	i915_active_init(i915, &active->base, __live_active_retire);
> +	active->retired = false;
> +
> +	if (!i915_active_acquire(&active->base)) {
> +		pr_err("First i915_active_acquire should report being idle\n");
> +		return -EINVAL;
> +	}
> +
> +	submit = heap_fence_create(GFP_KERNEL);
> +
> +	for_each_engine(engine, i915, id) {
> +		struct i915_request *rq;
> +
> +		rq = i915_request_alloc(engine, i915->kernel_context);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			break;
> +		}
> +
> +		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
> +						       submit,
> +						       GFP_KERNEL);
> +		if (err < 0) {
> +			pr_err("Failed to allocate submission fence!\n");
> +			i915_request_add(rq);
> +			break;
> +		}
> +
> +		err = i915_active_ref(&active->base, rq->fence.context, rq);
> +		if (err) {
> +			pr_err("Failed to track active ref!\n");
> +			i915_request_add(rq);
> +			break;
> +		}
> +
> +		i915_request_add(rq);
> +		count++;
> +	}
> +
> +	i915_active_release(&active->base);
> +	if (active->retired) {
> +		pr_err("i915_active retired before submission!\n");
> +		err = -EINVAL;
> +	}
> +	if (active->base.count != count) {
> +		pr_err("i915_active not tracking all requests, found %d, expected %d\n",
> +		       active->base.count, count);
> +		err = -EINVAL;
> +	}
> +
> +	i915_sw_fence_commit(submit);
> +	heap_fence_put(submit);
> +
> +	return err;
> +}
> +
> +static int live_active_wait(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct live_active active;
> +	intel_wakeref_t wakeref;
> +	int err;
> +
> +	/* Check that we get a callback when requests retire upon waiting */
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	err = __live_active_setup(i915, &active);
> +
> +	i915_active_wait(&active.base);
> +	if (!active.retired) {
> +		pr_err("i915_active not retired after waiting!\n");
> +		err = -EINVAL;
> +	}
> +
> +	i915_active_fini(&active.base);
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
> +static int live_active_retire(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct live_active active;
> +	intel_wakeref_t wakeref;
> +	int err;
> +
> +	/* Check that we get a callback when requests are indirectly retired */
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	err = __live_active_setup(i915, &active);
> +
> +	/* waits for & retires all requests */
> +	if (igt_flush_test(i915, I915_WAIT_LOCKED))
> +		err = -EIO;
> +
> +	if (!active.retired) {
> +		pr_err("i915_active not retired after flushing!\n");
> +		err = -EINVAL;
> +	}
> +
> +	i915_active_fini(&active.base);
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +	return err;
> +}
> +
> +int i915_active_live_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_active_wait),
> +		SUBTEST(live_active_retire),
> +	};
> +
> +	if (i915_terminally_wedged(&i915->gpu_error))
> +		return 0;
> +
> +	return i915_subtests(tests, i915);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index 76b4f87fc853..6d766925ad04 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -12,8 +12,9 @@
>   selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */
>   selftest(uncore, intel_uncore_live_selftests)
>   selftest(workarounds, intel_workarounds_live_selftests)
> -selftest(requests, i915_request_live_selftests)
>   selftest(timelines, i915_timeline_live_selftests)
> +selftest(requests, i915_request_live_selftests)
> +selftest(active, i915_active_live_selftests)
>   selftest(objects, i915_gem_object_live_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_live_selftests)
>   selftest(coherency, i915_gem_coherency_live_selftests)
> 

Body looks good. I left selftests for after I get further in the series.

Regards,

Tvrtko

* Re: [PATCH 04/11] drm/i915: Generalise GPU activity tracking
  2019-01-30 12:02   ` Tvrtko Ursulin
@ 2019-01-30 12:14     ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 12:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-30 12:02:02)
> 
> On 30/01/2019 02:18, Chris Wilson wrote:
> > +struct active_node {
> > +     struct i915_gem_active base;
> > +     struct i915_active *ref;
> > +     struct rb_node node;
> > +     u64 timeline;
> > +};
> > +
> > +static void
> > +__active_retire(struct i915_active *ref)
> > +{
> 
> You wouldn't consider naming this variable 'active' throughout? Ref 
> reminds me so much of a kref eg. fence->refcount. Although 'active' has 
> been used for gem_active so far. Not a blocker though, I'll get used to it.

The similarity to kref wasn't a coincidence; we use the concept of an
active reference here as well.

I'm not sold on the name yet. I liked active_tracker but thought that
was slightly too long for something I expect to be fairly ubiquitous.
struct reservation_object, I don't like because reservation is a phase
before building the request.

i915_shared_fence and i915_exclusive_fence; maybe? But fences tend to
be the single-shot, aka i915_request.

i915_hedgerow. I'm being silly now.

If only I could come up with as catchy a name as rcu.

rgu? Read-gpu-update. grf; gpu-read-reference.

> > +int i915_active_ref(struct i915_active *ref,
> > +                 u64 timeline,
> > +                 struct i915_request *rq)
> > +{
> > +     struct i915_gem_active *active;
> > +
> > +     active = active_instance(ref, timeline);
> > +     if (IS_ERR(active))
> > +             return PTR_ERR(active);
> > +
> > +     if (!i915_gem_active_isset(active))
> > +             ref->count++;
> 
> Could stick a super-paranoid overflow GEM_BUG_ON here.

Yeah. Wouldn't it be nice if refcount_t wasn't quite so tied to being a
refcount. Just a plain old count_t and atomic_count_t.

> > -     if (!i915_gem_active_isset(active) && !vma->active_count++)
> > +     if (!vma->active.count)
> >               obj->active_count++;
> > -     i915_gem_active_set(active, rq);
> > +
> > +     if (unlikely(i915_active_ref(&vma->active, rq->fence.context, rq))) {
> > +             if (!vma->active.count)
> > +                     obj->active_count--;
> > +             return -ENOMEM;
> > +     }
> 
> Optionally you could make i915_active_ref return the old ref count or 
> error. Then this could become simpler:
> 
>         ret = i915_active_ref(..);
>         if (unlikely(ret < 0))
>                 return -ENOMEM;
>         else if (ret == 0)
>                 obj->active_count++;

Heh, didn't immediately strike me as simpler, but I did also consider
it.

I think for the atomic variant, we may just pass an init_func(). So
watch this space.
 
> Body looks good. I left selftests for after I get a further in the series.

They weren't very exciting I'm afraid, just aiming for a minimal walk
through the API points.
-Chris

* Re: [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
@ 2019-01-30 15:57   ` Tvrtko Ursulin
  2019-01-30 16:08     ` Chris Wilson
  2019-01-30 16:37   ` [PATCH v2] " Chris Wilson
  2019-01-30 17:25   ` [PATCH v3] " Chris Wilson
  2 siblings, 1 reply; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-30 15:57 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:19, Chris Wilson wrote:
> Wrap the active tracking for GPU references in a slabcache for faster
> allocations, and keep track of inflight nodes so we can reap the
> stale entries upon parking (thereby trimming our memory usage).

I suggest a two-stage approach: a first patch adds the slab cache (you can 
also call kmem_cache_shrink on park, as we do for other caches), and a 
second patch then adds the parking/reaping bit.
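
Rough sketch of what I mean for the shrink-on-park half, illustrative only 
(reusing the i915_gt_active and gt->slab_cache names from this patch):

	static void i915_gt_active_park(struct i915_gt_active *gt)
	{
		/* first stage: just hand unused slab pages back to the system */
		kmem_cache_shrink(gt->slab_cache);
	}

The reaping of stale entries can then be layered on top as the second stage.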

Under what scenarios do we end up not freeing active nodes sufficiently? It 
would have to be some user which keeps many contexts around, having only 
used them once?

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_active.c            | 55 ++++++++++++++++---
>   drivers/gpu/drm/i915/i915_active.h            | 21 +++++--
>   drivers/gpu/drm/i915/i915_active_types.h      | 12 +++-
>   drivers/gpu/drm/i915/i915_drv.h               |  2 +
>   drivers/gpu/drm/i915/i915_gem.c               | 16 +++++-
>   drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
>   drivers/gpu/drm/i915/i915_vma.c               |  3 +-
>   drivers/gpu/drm/i915/selftests/i915_active.c  |  3 +-
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |  6 ++
>   9 files changed, 100 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index e0182e19cb8b..3c7abbde42ac 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -7,7 +7,9 @@
>   #include "i915_drv.h"
>   #include "i915_active.h"
>   
> -#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
> +#define i915_from_gt(x) \
> +	container_of(x, struct drm_i915_private, gt.active_refs)
> +#define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
>   
>   struct active_node {
>   	struct i915_gem_active base;
> @@ -79,11 +81,11 @@ active_instance(struct i915_active *ref, u64 idx)
>   			p = &parent->rb_left;
>   	}
>   
> -	node = kmalloc(sizeof(*node), GFP_KERNEL);
> +	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
>   
>   	/* kmalloc may retire the ref->last (thanks shrinker)! */
>   	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
> -		kfree(node);
> +		kmem_cache_free(ref->gt->slab_cache, node);
>   		goto out;
>   	}
>   
> @@ -94,6 +96,9 @@ active_instance(struct i915_active *ref, u64 idx)
>   	node->ref = ref;
>   	node->timeline = idx;
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		list_add(&ref->active_link, &ref->gt->active_refs);
> +
>   	rb_link_node(&node->node, parent, p);
>   	rb_insert_color(&node->node, &ref->tree);
>   
> @@ -119,11 +124,11 @@ active_instance(struct i915_active *ref, u64 idx)
>   	return &ref->last;
>   }
>   
> -void i915_active_init(struct drm_i915_private *i915,
> +void i915_active_init(struct i915_gt_active *gt,
>   		      struct i915_active *ref,
>   		      void (*retire)(struct i915_active *ref))
>   {
> -	ref->i915 = i915;
> +	ref->gt = gt;
>   	ref->retire = retire;
>   	ref->tree = RB_ROOT;
>   	init_request_active(&ref->last, last_retire);
> @@ -161,6 +166,7 @@ void i915_active_release(struct i915_active *ref)
>   
>   int i915_active_wait(struct i915_active *ref)
>   {
> +	struct kmem_cache *slab = ref->gt->slab_cache;
>   	struct active_node *it, *n;
>   	int ret;
>   
> @@ -168,15 +174,19 @@ int i915_active_wait(struct i915_active *ref)
>   	if (ret)
>   		return ret;
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		return 0;
> +
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
>   		ret = i915_gem_active_retire(&it->base, BKL(ref));
>   		if (ret)
>   			return ret;
>   
>   		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> -		kfree(it);
> +		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> +	list_del(&ref->active_link);
>   
>   	return 0;
>   }
> @@ -210,15 +220,46 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
>   
>   void i915_active_fini(struct i915_active *ref)
>   {
> +	struct kmem_cache *slab = ref->gt->slab_cache;
>   	struct active_node *it, *n;
>   
> +	lockdep_assert_held(BKL(ref));
>   	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
>   
> +	if (RB_EMPTY_ROOT(&ref->tree))
> +		return;
> +
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
>   		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> -		kfree(it);
> +		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> +	list_del(&ref->active_link);
> +}
> +
> +int i915_gt_active_init(struct i915_gt_active *gt)
> +{
> +	gt->slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
> +	if (!gt->slab_cache)
> +		return -ENOMEM;
> +
> +	INIT_LIST_HEAD(&gt->active_refs);
> +
> +	return 0;
> +}
> +
> +void i915_gt_active_park(struct i915_gt_active *gt)
> +{
> +	struct i915_active *it, *n;
> +
> +	list_for_each_entry_safe(it, n, &gt->active_refs, active_link)
> +		i915_active_fini(it);
> +}
> +
> +void i915_gt_active_fini(struct i915_gt_active *gt)
> +{
> +	GEM_BUG_ON(!list_empty(&gt->active_refs));
> +	kmem_cache_destroy(gt->slab_cache);
>   }
>   
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index c0729a046f98..41c4a5da84c8 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -9,10 +9,6 @@
>   
>   #include "i915_active_types.h"
>   
> -#include <linux/rbtree.h>
> -
> -#include "i915_request.h"
> -
>   /*
>    * GPU activity tracking
>    *
> @@ -39,7 +35,7 @@
>    * synchronisation.
>    */
>   
> -void i915_active_init(struct drm_i915_private *i915,
> +void i915_active_init(struct i915_gt_active *gt,
>   		      struct i915_active *ref,
>   		      void (*retire)(struct i915_active *ref));
>   
> @@ -63,4 +59,19 @@ i915_active_is_idle(const struct i915_active *ref)
>   
>   void i915_active_fini(struct i915_active *ref);
>   
> +/*
> + * Active refs memory management
> + *
> + * To be more economical with memory, we reap all the i915_active trees on
> + * parking the GPU (when we know the GPU is inactive) and allocate the nodes
> + * from a local slab cache to hopefully reduce the fragmentation as we will
> + * then be able to free all pages en masse upon idling.
> + */
> +
> +int i915_gt_active_init(struct i915_gt_active *gt);
> +void i915_gt_active_park(struct i915_gt_active *gt);
> +void i915_gt_active_fini(struct i915_gt_active *gt);
> +
> +#define i915_gt_active(i915) (&(i915)->gt.active_refs)
> +
>   #endif /* _I915_ACTIVE_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> index 411e502ed8dd..3d41c33ca78c 100644
> --- a/drivers/gpu/drm/i915/i915_active_types.h
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -7,14 +7,17 @@
>   #ifndef _I915_ACTIVE_TYPES_H_
>   #define _I915_ACTIVE_TYPES_H_
>   
> +#include <linux/list.h>
>   #include <linux/rbtree.h>
>   
>   #include "i915_request.h"
>   
> -struct drm_i915_private;
> +struct i915_gt_active;
> +struct kmem_cache;
>   
>   struct i915_active {
> -	struct drm_i915_private *i915;
> +	struct i915_gt_active *gt;

gt_active would be better - gt is too vague.

> +	struct list_head active_link;
>   
>   	struct rb_root tree;
>   	struct i915_gem_active last;
> @@ -23,4 +26,9 @@ struct i915_active {
>   	void (*retire)(struct i915_active *ref);
>   };
>   
> +struct i915_gt_active {
> +	struct list_head active_refs;
> +	struct kmem_cache *slab_cache;
> +};
> +
>   #endif /* _I915_ACTIVE_TYPES_H_ */
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 8ec28a7f5452..480ab3e00ba8 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1984,6 +1984,8 @@ struct drm_i915_private {
>   			struct list_head hwsp_free_list;
>   		} timelines;
>   
> +		struct i915_gt_active active_refs;
> +
>   		struct list_head active_rings;
>   		struct list_head closed_vma;
>   		u32 active_requests;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index caccff87a2a1..2bc735df408b 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -130,6 +130,7 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
>   
>   	intel_engines_park(i915);
>   	i915_timelines_park(i915);
> +	i915_gt_active_park(i915_gt_active(i915));

The i915_gt_active macro is just too horrible IMHO. Why? :)

Make i915_gt_active_park take i915, or i915->gt_active.

>   
>   	i915_pmu_gt_parked(i915);
>   	i915_vma_parked(i915);
> @@ -4998,15 +4999,19 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
>   	}
>   
> +	ret = i915_gt_active_init(i915_gt_active(dev_priv));
> +	if (ret)
> +		return ret;
> +
>   	i915_timelines_init(dev_priv);
>   
>   	ret = i915_gem_init_userptr(dev_priv);
>   	if (ret)
> -		return ret;
> +		goto err_timelines;
>   
>   	ret = intel_uc_init_misc(dev_priv);
>   	if (ret)
> -		return ret;
> +		goto err_userptr;
>   
>   	ret = intel_wopcm_init(&dev_priv->wopcm);
>   	if (ret)
> @@ -5122,9 +5127,13 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   err_uc_misc:
>   	intel_uc_fini_misc(dev_priv);
>   
> -	if (ret != -EIO) {
> +err_userptr:
> +	if (ret != -EIO)
>   		i915_gem_cleanup_userptr(dev_priv);
> +err_timelines:
> +	if (ret != -EIO) {
>   		i915_timelines_fini(dev_priv);
> +		i915_gt_active_fini(i915_gt_active(dev_priv));
>   	}
>   
>   	if (ret == -EIO) {
> @@ -5177,6 +5186,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
>   	intel_uc_fini_misc(dev_priv);
>   	i915_gem_cleanup_userptr(dev_priv);
>   	i915_timelines_fini(dev_priv);
> +	i915_gt_active_fini(i915_gt_active(dev_priv));
>   
>   	i915_gem_drain_freed_objects(dev_priv);
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index e625659c03a2..d8819de0d6ee 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1917,7 +1917,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
>   	if (!vma)
>   		return ERR_PTR(-ENOMEM);
>   
> -	i915_active_init(i915, &vma->active, NULL);
> +	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
>   	init_request_active(&vma->last_fence, NULL);
>   
>   	vma->vm = &ggtt->vm;
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index d4772061e642..2456bfb4877b 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -119,7 +119,8 @@ vma_create(struct drm_i915_gem_object *obj,
>   	if (vma == NULL)
>   		return ERR_PTR(-ENOMEM);
>   
> -	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
> +	i915_active_init(i915_gt_active(vm->i915),
> +			 &vma->active, __i915_vma_retire);
>   	init_request_active(&vma->last_fence, NULL);
>   
>   	vma->vm = vm;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
> index 7c5c3068565b..0e923476920e 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_active.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_active.c
> @@ -30,7 +30,8 @@ static int __live_active_setup(struct drm_i915_private *i915,
>   	unsigned int count = 0;
>   	int err = 0;
>   
> -	i915_active_init(i915, &active->base, __live_active_retire);
> +	i915_active_init(i915_gt_active(i915),
> +			 &active->base, __live_active_retire);
>   	active->retired = false;
>   
>   	if (!i915_active_acquire(&active->base)) {
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 074a0d9cbf26..5b88f74c1677 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -69,6 +69,7 @@ static void mock_device_release(struct drm_device *dev)
>   	mutex_unlock(&i915->drm.struct_mutex);
>   
>   	i915_timelines_fini(i915);
> +	i915_gt_active_fini(i915_gt_active(i915));
>   
>   	drain_workqueue(i915->wq);
>   	i915_gem_drain_freed_objects(i915);
> @@ -228,6 +229,9 @@ struct drm_i915_private *mock_gem_device(void)
>   	if (!i915->priorities)
>   		goto err_dependencies;
>   
> +	if (i915_gt_active_init(i915_gt_active(i915)))
> +		goto err_priorities;
> +
>   	i915_timelines_init(i915);
>   
>   	INIT_LIST_HEAD(&i915->gt.active_rings);
> @@ -257,6 +261,8 @@ struct drm_i915_private *mock_gem_device(void)
>   err_unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	i915_timelines_fini(i915);
> +	i915_gt_active_fini(i915_gt_active(i915));
> +err_priorities:
>   	kmem_cache_destroy(i915->priorities);
>   err_dependencies:
>   	kmem_cache_destroy(i915->dependencies);
> 

Regards,

Tvrtko

* Re: [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30 15:57   ` Tvrtko Ursulin
@ 2019-01-30 16:08     ` Chris Wilson
  2019-01-30 21:58       ` Chris Wilson
  0 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 16:08 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-30 15:57:06)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Wrap the active tracking for GPU references in a slabcache for faster
> > allocations, and keep track of inflight nodes so we can reap the
> > stale entries upon parking (thereby trimming our memory usage).
> 
> I suggest a two-stage approach. The first patch adds a slab cache (you can 
> also add kmem_cache_shrink on park as we do for other caches), then a 
> second adds the parking/reaping bit.

Not really seeing the point; the lesson learnt is that we should be
tidying up on parking, and that's why having a slab makes sense in the
first place.

I can see one argument to not do the idle reaping immediately and that's
if we apply the reaping on vma idle instead. Then parking is redundant.
 
> Under what scenarios do we end up not freeing active nodes quickly enough? 
> It would have to be some user which keeps many contexts around, having only 
> used them once?

Never in a typical scenario :) Almost every allocation is served from
the last_request slot, you need to have a pair of concurrent references
to a vma for it to activate, and that takes multiple engines reading the
same object within a context/ppgtt.
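
(To make that concrete, a minimal illustration -- example_two_timelines()
is a made-up helper, and rq_a/rq_b are assumed to come from two different
contexts, i.e. different fence.context values, with rq_a still in flight:)

    static int example_two_timelines(struct i915_vma *vma,
                                     struct i915_request *rq_a,
                                     struct i915_request *rq_b)
    {
            int err;

            /* First (and typically only) user: lands in ref->last, no alloc */
            err = i915_active_ref(&vma->active, rq_a->fence.context, rq_a);
            if (err)
                    return err;

            /*
             * A second, still-busy timeline: ref->last is spilled into an
             * rbtree node allocated from the slab cache. Only this case
             * leaves nodes behind to reap when the GPU parks.
             */
            return i915_active_ref(&vma->active, rq_b->fence.context, rq_b);
    }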

> >   struct i915_active {
> > -     struct drm_i915_private *i915;
> > +     struct i915_gt_active *gt;
> 
> gt_active would be better - gt is too vague.

As a backpointer though it's defined by its owner. Give me a better name
for everything.

> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index caccff87a2a1..2bc735df408b 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -130,6 +130,7 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
> >   
> >       intel_engines_park(i915);
> >       i915_timelines_park(i915);
> > +     i915_gt_active_park(i915_gt_active(i915));
> 
> The i915_gt_active macro is just too horrible IMHO. Why? :)

Because it was called gt.active_refs and that was irking me. I really
haven't settled on the name.

> Make i915_gt_active_park take i915, or i915->gt_active.

The pretense at encapsulation was nice, and I'd like to push harder on
that front.
-Chris

* Re: [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family
  2019-01-30  2:19 ` [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family Chris Wilson
@ 2019-01-30 16:11   ` Tvrtko Ursulin
  0 siblings, 0 replies; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-30 16:11 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:19, Chris Wilson wrote:
> Looking forward, we need to break the struct_mutex dependency on
> i915_gem_active. In the meantime, external use of i915_gem_active is
> quite beguiling; little do new users suspect that it implies a barrier,
> as each request it tracks must be ordered wrt the previous one.
> i915_active, tracking one request per timeline, can instead be used to
> track activity across multiple timelines, a shared fence, which fits our
> unordered request submission much better. We need to steer external
> users away from the singular, exclusive fence imposed by i915_gem_active
> to i915_active instead. As part of that process, we move i915_gem_active
> out of i915_request.c into i915_active.c to start separating the two
> concepts, and rename it to i915_active_request (both to tie it to the
> concept of tracking just one request, and to give it a longer, less
> appealing name).

Without even considering anything else you mentioned, 
i915_active_request became a much better name as soon as i915_active was
added.
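
(A minimal illustration of the distinction the commit message draws, using
the two vma trackers as they appear in the diff below; example_track() is
a made-up helper, not code from the patch, and the i915_active.h API from
this series is assumed:)

    static int example_track(struct i915_vma *vma, struct i915_request *rq)
    {
            /*
             * i915_active_request: a single slot for a single request.
             * Writing a new request assumes it is ordered after (retires
             * no earlier than) whatever was tracked before -- the
             * exclusive-fence rule.
             */
            __i915_active_request_set(&vma->last_fence, rq);

            /*
             * i915_active: one slot per timeline, so requests submitted on
             * different timelines may complete in any order -- a shared
             * fence, matching unordered submission.
             */
            return i915_active_ref(&vma->active, rq->fence.context, rq);
    }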

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_active.c            |  64 ++-
>   drivers/gpu/drm/i915/i915_active.h            | 348 ++++++++++++++++
>   drivers/gpu/drm/i915/i915_active_types.h      |  13 +-
>   drivers/gpu/drm/i915/i915_debugfs.c           |   2 +-
>   drivers/gpu/drm/i915/i915_gem.c               |  10 +-
>   drivers/gpu/drm/i915/i915_gem_context.c       |   4 +-
>   drivers/gpu/drm/i915/i915_gem_fence_reg.c     |   4 +-
>   drivers/gpu/drm/i915/i915_gem_gtt.c           |   2 +-
>   drivers/gpu/drm/i915/i915_gem_object.h        |   2 +-
>   drivers/gpu/drm/i915/i915_gpu_error.c         |  10 +-
>   drivers/gpu/drm/i915/i915_request.c           |  35 +-
>   drivers/gpu/drm/i915/i915_request.h           | 383 ------------------
>   drivers/gpu/drm/i915/i915_reset.c             |   2 +-
>   drivers/gpu/drm/i915/i915_timeline.c          |  25 +-
>   drivers/gpu/drm/i915/i915_timeline.h          |  14 +-
>   drivers/gpu/drm/i915/i915_vma.c               |  12 +-
>   drivers/gpu/drm/i915/i915_vma.h               |   2 +-
>   drivers/gpu/drm/i915/intel_engine_cs.c        |   2 +-
>   drivers/gpu/drm/i915/intel_overlay.c          |  33 +-
>   drivers/gpu/drm/i915/selftests/intel_lrc.c    |   4 +-
>   .../gpu/drm/i915/selftests/mock_timeline.c    |   4 +-
>   21 files changed, 473 insertions(+), 502 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
> index 3c7abbde42ac..007098e44959 100644
> --- a/drivers/gpu/drm/i915/i915_active.c
> +++ b/drivers/gpu/drm/i915/i915_active.c
> @@ -12,7 +12,7 @@
>   #define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
>   
>   struct active_node {
> -	struct i915_gem_active base;
> +	struct i915_active_request base;
>   	struct i915_active *ref;
>   	struct rb_node node;
>   	u64 timeline;
> @@ -27,18 +27,18 @@ __active_retire(struct i915_active *ref)
>   }
>   
>   static void
> -node_retire(struct i915_gem_active *base, struct i915_request *rq)
> +node_retire(struct i915_active_request *base, struct i915_request *rq)
>   {
>   	__active_retire(container_of(base, struct active_node, base)->ref);
>   }
>   
>   static void
> -last_retire(struct i915_gem_active *base, struct i915_request *rq)
> +last_retire(struct i915_active_request *base, struct i915_request *rq)
>   {
>   	__active_retire(container_of(base, struct i915_active, last));
>   }
>   
> -static struct i915_gem_active *
> +static struct i915_active_request *
>   active_instance(struct i915_active *ref, u64 idx)
>   {
>   	struct active_node *node;
> @@ -59,7 +59,7 @@ active_instance(struct i915_active *ref, u64 idx)
>   	 * twice for the same timeline (as the older rbtree element will be
>   	 * retired before the new request added to last).
>   	 */
> -	old = i915_gem_active_raw(&ref->last, BKL(ref));
> +	old = i915_active_request_raw(&ref->last, BKL(ref));
>   	if (!old || old->fence.context == idx)
>   		goto out;
>   
> @@ -84,7 +84,7 @@ active_instance(struct i915_active *ref, u64 idx)
>   	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
>   
>   	/* kmalloc may retire the ref->last (thanks shrinker)! */
> -	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
> +	if (unlikely(!i915_active_request_raw(&ref->last, BKL(ref)))) {
>   		kmem_cache_free(ref->gt->slab_cache, node);
>   		goto out;
>   	}
> @@ -92,7 +92,7 @@ active_instance(struct i915_active *ref, u64 idx)
>   	if (unlikely(!node))
>   		return ERR_PTR(-ENOMEM);
>   
> -	init_request_active(&node->base, node_retire);
> +	i915_active_request_init(&node->base, NULL, node_retire);
>   	node->ref = ref;
>   	node->timeline = idx;
>   
> @@ -110,7 +110,7 @@ active_instance(struct i915_active *ref, u64 idx)
>   	 * callback not two, and so much undo the active counting for the
>   	 * overwritten slot.
>   	 */
> -	if (i915_gem_active_isset(&node->base)) {
> +	if (i915_active_request_isset(&node->base)) {
>   		/* Retire ourselves from the old rq->active_list */
>   		__list_del_entry(&node->base.link);
>   		ref->count--;
> @@ -131,7 +131,7 @@ void i915_active_init(struct i915_gt_active *gt,
>   	ref->gt = gt;
>   	ref->retire = retire;
>   	ref->tree = RB_ROOT;
> -	init_request_active(&ref->last, last_retire);
> +	i915_active_request_init(&ref->last, NULL, last_retire);
>   	ref->count = 0;
>   }
>   
> @@ -139,15 +139,15 @@ int i915_active_ref(struct i915_active *ref,
>   		    u64 timeline,
>   		    struct i915_request *rq)
>   {
> -	struct i915_gem_active *active;
> +	struct i915_active_request *active;
>   
>   	active = active_instance(ref, timeline);
>   	if (IS_ERR(active))
>   		return PTR_ERR(active);
>   
> -	if (!i915_gem_active_isset(active))
> +	if (!i915_active_request_isset(active))
>   		ref->count++;
> -	i915_gem_active_set(active, rq);
> +	__i915_active_request_set(active, rq);
>   
>   	return 0;
>   }
> @@ -170,7 +170,7 @@ int i915_active_wait(struct i915_active *ref)
>   	struct active_node *it, *n;
>   	int ret;
>   
> -	ret = i915_gem_active_retire(&ref->last, BKL(ref));
> +	ret = i915_active_request_retire(&ref->last, BKL(ref));
>   	if (ret)
>   		return ret;
>   
> @@ -178,11 +178,11 @@ int i915_active_wait(struct i915_active *ref)
>   		return 0;
>   
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> -		ret = i915_gem_active_retire(&it->base, BKL(ref));
> +		ret = i915_active_request_retire(&it->base, BKL(ref));
>   		if (ret)
>   			return ret;
>   
> -		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> +		GEM_BUG_ON(i915_active_request_isset(&it->base));
>   		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> @@ -191,11 +191,11 @@ int i915_active_wait(struct i915_active *ref)
>   	return 0;
>   }
>   
> -static int __i915_request_await_active(struct i915_request *rq,
> -				       struct i915_gem_active *active)
> +int i915_request_await_active_request(struct i915_request *rq,
> +				      struct i915_active_request *active)
>   {
>   	struct i915_request *barrier =
> -		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
> +		i915_active_request_raw(active, &rq->i915->drm.struct_mutex);
>   
>   	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
>   }
> @@ -205,12 +205,12 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
>   	struct active_node *it, *n;
>   	int ret;
>   
> -	ret = __i915_request_await_active(rq, &ref->last);
> +	ret = i915_request_await_active_request(rq, &ref->last);
>   	if (ret)
>   		return ret;
>   
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> -		ret = __i915_request_await_active(rq, &it->base);
> +		ret = i915_request_await_active_request(rq, &it->base);
>   		if (ret)
>   			return ret;
>   	}
> @@ -224,13 +224,13 @@ void i915_active_fini(struct i915_active *ref)
>   	struct active_node *it, *n;
>   
>   	lockdep_assert_held(BKL(ref));
> -	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
> +	GEM_BUG_ON(i915_active_request_isset(&ref->last));
>   
>   	if (RB_EMPTY_ROOT(&ref->tree))
>   		return;
>   
>   	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
> -		GEM_BUG_ON(i915_gem_active_isset(&it->base));
> +		GEM_BUG_ON(i915_active_request_isset(&it->base));
>   		kmem_cache_free(slab, it);
>   	}
>   	ref->tree = RB_ROOT;
> @@ -262,6 +262,26 @@ void i915_gt_active_fini(struct i915_gt_active *gt)
>   	kmem_cache_destroy(gt->slab_cache);
>   }
>   
> +int i915_active_request_set(struct i915_active_request *active,
> +			    struct i915_request *rq)
> +{
> +	int err;
> +
> +	/* Must maintain ordering wrt previous active requests */
> +	err = i915_request_await_active_request(rq, active);
> +	if (err)
> +		return err;
> +
> +	__i915_active_request_set(active, rq);
> +	return 0;
> +}
> +
> +void i915_active_retire_noop(struct i915_active_request *active,
> +			     struct i915_request *request)
> +{
> +	/* Space left intentionally blank */
> +}
> +
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>   #include "selftests/i915_active.c"
>   #endif
> diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
> index 41c4a5da84c8..e24421a6ac5c 100644
> --- a/drivers/gpu/drm/i915/i915_active.h
> +++ b/drivers/gpu/drm/i915/i915_active.h
> @@ -7,8 +7,354 @@
>   #ifndef _I915_ACTIVE_H_
>   #define _I915_ACTIVE_H_
>   
> +#include <linux/lockdep.h>
> +
>   #include "i915_active_types.h"
>   
> +/*
> > + * We treat requests as fences. This is not to be confused with our
> + * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync.
> + * We use the fences to synchronize access from the CPU with activity on the
> + * GPU, for example, we should not rewrite an object's PTE whilst the GPU
> + * is reading them. We also track fences at a higher level to provide
> + * implicit synchronisation around GEM objects, e.g. set-domain will wait
> + * for outstanding GPU rendering before marking the object ready for CPU
> + * access, or a pageflip will wait until the GPU is complete before showing
> + * the frame on the scanout.
> + *
> + * In order to use a fence, the object must track the fence it needs to
> + * serialise with. For example, GEM objects want to track both read and
> + * write access so that we can perform concurrent read operations between
> + * the CPU and GPU engines, as well as waiting for all rendering to
> + * complete, or waiting for the last GPU user of a "fence register". The
> + * object then embeds a #i915_active_request to track the most recent (in
> + * retirement order) request relevant for the desired mode of access.
> + * The #i915_active_request is updated with i915_active_request_set() to
> + * track the most recent fence request, typically this is done as part of
> + * i915_vma_move_to_active().
> + *
> + * When the #i915_active_request completes (is retired), it will
> + * signal its completion to the owner through a callback as well as mark
> + * itself as idle (i915_active_request.request == NULL). The owner
> + * can then perform any action, such as delayed freeing of an active
> + * resource including itself.
> + */
> +
> +void i915_active_retire_noop(struct i915_active_request *active,
> +			     struct i915_request *request);
> +
> +/**
> + * i915_active_request_init - prepares the activity tracker for use
> + * @active - the active tracker
> + * @rq - initial request to track, can be NULL
> > + * @func - a callback for when the tracker is retired (becomes idle),
> + *         can be NULL
> + *
> + * i915_active_request_init() prepares the embedded @active struct for use as
> + * an activity tracker, that is for tracking the last known active request
> + * associated with it. When the last request becomes idle, when it is retired
> + * after completion, the optional callback @func is invoked.
> + */
> +static inline void
> +i915_active_request_init(struct i915_active_request *active,
> +			 struct i915_request *rq,
> +			 i915_active_retire_fn retire)
> +{
> +	RCU_INIT_POINTER(active->request, rq);
> +	INIT_LIST_HEAD(&active->link);
> +	active->retire = retire ?: i915_active_retire_noop;
> +}
> +
> +#define INIT_ACTIVE_REQUEST(name) i915_active_request_init((name), NULL, NULL)
> +
> +/**
> + * i915_active_request_set - updates the tracker to watch the current request
> + * @active - the active tracker
> + * @request - the request to watch
> + *
> + * __i915_active_request_set() watches the given @request for completion. Whilst
> + * that @request is busy, the @active reports busy. When that @request is
> + * retired, the @active tracker is updated to report idle.
> + */
> +static inline void
> +__i915_active_request_set(struct i915_active_request *active,
> +			  struct i915_request *request)
> +{
> +	list_move(&active->link, &request->active_list);
> +	rcu_assign_pointer(active->request, request);
> +}
> +
> +int __must_check
> +i915_active_request_set(struct i915_active_request *active,
> +			struct i915_request *rq);
> +
> +/**
> + * i915_active_request_set_retire_fn - updates the retirement callback
> + * @active - the active tracker
> + * @fn - the routine called when the request is retired
> + * @mutex - struct_mutex used to guard retirements
> + *
> + * i915_active_request_set_retire_fn() updates the function pointer that
> + * is called when the final request associated with the @active tracker
> + * is retired.
> + */
> +static inline void
> +i915_active_request_set_retire_fn(struct i915_active_request *active,
> +				  i915_active_retire_fn fn,
> +				  struct mutex *mutex)
> +{
> +	lockdep_assert_held(mutex);
> +	active->retire = fn ?: i915_active_retire_noop;
> +}
> +
> +static inline struct i915_request *
> +__i915_active_request_peek(const struct i915_active_request *active)
> +{
> +	/*
> +	 * Inside the error capture (running with the driver in an unknown
> +	 * state), we want to bend the rules slightly (a lot).
> +	 *
> +	 * Work is in progress to make it safer, in the meantime this keeps
> +	 * the known issue from spamming the logs.
> +	 */
> +	return rcu_dereference_protected(active->request, 1);
> +}
> +
> +/**
> + * i915_active_request_raw - return the active request
> + * @active - the active tracker
> + *
> + * i915_active_request_raw() returns the current request being tracked, or NULL.
> + * It does not obtain a reference on the request for the caller, so the caller
> + * must hold struct_mutex.
> + */
> +static inline struct i915_request *
> +i915_active_request_raw(const struct i915_active_request *active,
> +			struct mutex *mutex)
> +{
> +	return rcu_dereference_protected(active->request,
> +					 lockdep_is_held(mutex));
> +}
> +
> +/**
> + * i915_active_request_peek - report the active request being monitored
> + * @active - the active tracker
> + *
> + * i915_active_request_peek() returns the current request being tracked if
> + * still active, or NULL. It does not obtain a reference on the request
> + * for the caller, so the caller must hold struct_mutex.
> + */
> +static inline struct i915_request *
> +i915_active_request_peek(const struct i915_active_request *active,
> +			 struct mutex *mutex)
> +{
> +	struct i915_request *request;
> +
> +	request = i915_active_request_raw(active, mutex);
> +	if (!request || i915_request_completed(request))
> +		return NULL;
> +
> +	return request;
> +}
> +
> +/**
> + * i915_active_request_get - return a reference to the active request
> + * @active - the active tracker
> + *
> + * i915_active_request_get() returns a reference to the active request, or NULL
> + * if the active tracker is idle. The caller must hold struct_mutex.
> + */
> +static inline struct i915_request *
> +i915_active_request_get(const struct i915_active_request *active,
> +			struct mutex *mutex)
> +{
> +	return i915_request_get(i915_active_request_peek(active, mutex));
> +}
> +
> +/**
> + * __i915_active_request_get_rcu - return a reference to the active request
> + * @active - the active tracker
> + *
> + * __i915_active_request_get() returns a reference to the active request,
> + * or NULL if the active tracker is idle. The caller must hold the RCU read
> + * lock, but the returned pointer is safe to use outside of RCU.
> + */
> +static inline struct i915_request *
> +__i915_active_request_get_rcu(const struct i915_active_request *active)
> +{
> +	/*
> +	 * Performing a lockless retrieval of the active request is super
> +	 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
> +	 * slab of request objects will not be freed whilst we hold the
> +	 * RCU read lock. It does not guarantee that the request itself
> +	 * will not be freed and then *reused*. Viz,
> +	 *
> +	 * Thread A			Thread B
> +	 *
> +	 * rq = active.request
> +	 *				retire(rq) -> free(rq);
> +	 *				(rq is now first on the slab freelist)
> +	 *				active.request = NULL
> +	 *
> +	 *				rq = new submission on a new object
> +	 * ref(rq)
> +	 *
> +	 * To prevent the request from being reused whilst the caller
> +	 * uses it, we take a reference like normal. Whilst acquiring
> +	 * the reference we check that it is not in a destroyed state
> +	 * (refcnt == 0). That prevents the request being reallocated
> +	 * whilst the caller holds on to it. To check that the request
> +	 * was not reallocated as we acquired the reference we have to
> +	 * check that our request remains the active request across
> +	 * the lookup, in the same manner as a seqlock. The visibility
> +	 * of the pointer versus the reference counting is controlled
> +	 * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
> +	 *
> +	 * In the middle of all that, we inspect whether the request is
> +	 * complete. Retiring is lazy so the request may be completed long
> +	 * before the active tracker is updated. Querying whether the
> +	 * request is complete is far cheaper (as it involves no locked
> +	 * instructions setting cachelines to exclusive) than acquiring
> +	 * the reference, so we do it first. The RCU read lock ensures the
> +	 * pointer dereference is valid, but does not ensure that the
> +	 * seqno nor HWS is the right one! However, if the request was
> +	 * reallocated, that means the active tracker's request was complete.
> +	 * If the new request is also complete, then both are and we can
> +	 * just report the active tracker is idle. If the new request is
> +	 * incomplete, then we acquire a reference on it and check that
> +	 * it remained the active request.
> +	 *
> +	 * It is then imperative that we do not zero the request on
> +	 * reallocation, so that we can chase the dangling pointers!
> +	 * See i915_request_alloc().
> +	 */
> +	do {
> +		struct i915_request *request;
> +
> +		request = rcu_dereference(active->request);
> +		if (!request || i915_request_completed(request))
> +			return NULL;
> +
> +		/*
> +		 * An especially silly compiler could decide to recompute the
> +		 * result of i915_request_completed, more specifically
> +		 * re-emit the load for request->fence.seqno. A race would catch
> +		 * a later seqno value, which could flip the result from true to
> +		 * false. Which means part of the instructions below might not
> +		 * be executed, while later on instructions are executed. Due to
> +		 * barriers within the refcounting the inconsistency can't reach
> +		 * past the call to i915_request_get_rcu, but not executing
> +		 * that while still executing i915_request_put() creates
> +		 * havoc enough.  Prevent this with a compiler barrier.
> +		 */
> +		barrier();
> +
> +		request = i915_request_get_rcu(request);
> +
> +		/*
> +		 * What stops the following rcu_access_pointer() from occurring
> +		 * before the above i915_request_get_rcu()? If we were
> +		 * to read the value before pausing to get the reference to
> +		 * the request, we may not notice a change in the active
> +		 * tracker.
> +		 *
> +		 * The rcu_access_pointer() is a mere compiler barrier, which
> +		 * means both the CPU and compiler are free to perform the
> +		 * memory read without constraint. The compiler only has to
> +		 * ensure that any operations after the rcu_access_pointer()
> +		 * occur afterwards in program order. This means the read may
> +		 * be performed earlier by an out-of-order CPU, or adventurous
> +		 * compiler.
> +		 *
> +		 * The atomic operation at the heart of
> +		 * i915_request_get_rcu(), see dma_fence_get_rcu(), is
> +		 * atomic_inc_not_zero() which is only a full memory barrier
> +		 * when successful. That is, if i915_request_get_rcu()
> +		 * returns the request (and so with the reference counted
> +		 * incremented) then the following read for rcu_access_pointer()
> +		 * must occur after the atomic operation and so confirm
> +		 * that this request is the one currently being tracked.
> +		 *
> +		 * The corresponding write barrier is part of
> +		 * rcu_assign_pointer().
> +		 */
> +		if (!request || request == rcu_access_pointer(active->request))
> +			return rcu_pointer_handoff(request);
> +
> +		i915_request_put(request);
> +	} while (1);
> +}
> +
> +/**
> + * i915_active_request_get_unlocked - return a reference to the active request
> + * @active - the active tracker
> + *
> + * i915_active_request_get_unlocked() returns a reference to the active request,
> + * or NULL if the active tracker is idle. The reference is obtained under RCU,
> + * so no locking is required by the caller.
> + *
> + * The reference should be freed with i915_request_put().
> + */
> +static inline struct i915_request *
> +i915_active_request_get_unlocked(const struct i915_active_request *active)
> +{
> +	struct i915_request *request;
> +
> +	rcu_read_lock();
> +	request = __i915_active_request_get_rcu(active);
> +	rcu_read_unlock();
> +
> +	return request;
> +}
> +
> +/**
> + * i915_active_request_isset - report whether the active tracker is assigned
> + * @active - the active tracker
> + *
> + * i915_active_request_isset() returns true if the active tracker is currently
> + * assigned to a request. Due to the lazy retiring, that request may be idle
> + * and this may report stale information.
> + */
> +static inline bool
> +i915_active_request_isset(const struct i915_active_request *active)
> +{
> +	return rcu_access_pointer(active->request);
> +}
> +
> +/**
> + * i915_active_request_retire - waits until the request is retired
> + * @active - the active request on which to wait
> + *
> + * i915_active_request_retire() waits until the request is completed,
> + * and then ensures that at least the retirement handler for this
> + * @active tracker is called before returning. If the @active
> + * tracker is idle, the function returns immediately.
> + */
> +static inline int __must_check
> +i915_active_request_retire(struct i915_active_request *active,
> +			   struct mutex *mutex)
> +{
> +	struct i915_request *request;
> +	long ret;
> +
> +	request = i915_active_request_raw(active, mutex);
> +	if (!request)
> +		return 0;
> +
> +	ret = i915_request_wait(request,
> +				I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> +				MAX_SCHEDULE_TIMEOUT);
> +	if (ret < 0)
> +		return ret;
> +
> +	list_del_init(&active->link);
> +	RCU_INIT_POINTER(active->request, NULL);
> +
> +	active->retire(active, request);
> +
> +	return 0;
> +}
> +
>   /*
>    * GPU activity tracking
>    *
> @@ -47,6 +393,8 @@ int i915_active_wait(struct i915_active *ref);
>   
>   int i915_request_await_active(struct i915_request *rq,
>   			      struct i915_active *ref);
> +int i915_request_await_active_request(struct i915_request *rq,
> +				      struct i915_active_request *active);
>   
>   bool i915_active_acquire(struct i915_active *ref);
>   void i915_active_release(struct i915_active *ref);
> diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
> index 3d41c33ca78c..7c1b0b1958fa 100644
> --- a/drivers/gpu/drm/i915/i915_active_types.h
> +++ b/drivers/gpu/drm/i915/i915_active_types.h
> @@ -9,18 +9,29 @@
>   
>   #include <linux/list.h>
>   #include <linux/rbtree.h>
> +#include <linux/rcupdate.h>
>   
>   #include "i915_request.h"
>   
> +struct i915_active_request;
>   struct i915_gt_active;
>   struct kmem_cache;
>   
> +typedef void (*i915_active_retire_fn)(struct i915_active_request *,
> +				      struct i915_request *);
> +
> +struct i915_active_request {
> +	struct i915_request __rcu *request;
> +	struct list_head link;
> +	i915_active_retire_fn retire;
> +};
> +
>   struct i915_active {
>   	struct i915_gt_active *gt;
>   	struct list_head active_link;
>   
>   	struct rb_root tree;
> -	struct i915_gem_active last;
> +	struct i915_active_request last;
>   	unsigned int count;
>   
>   	void (*retire)(struct i915_active *ref);
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 2cea263b4d79..9cf86c8df958 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -207,7 +207,7 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
>   		if (vma->fence)
>   			seq_printf(m, " , fence: %d%s",
>   				   vma->fence->id,
> -				   i915_gem_active_isset(&vma->last_fence) ? "*" : "");
> +				   i915_active_request_isset(&vma->last_fence) ? "*" : "");
>   		seq_puts(m, ")");
>   	}
>   	if (obj->stolen)
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 2bc735df408b..ceb06cf73fc3 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2988,7 +2988,7 @@ static void assert_kernel_context_is_current(struct drm_i915_private *i915)
>   
>   	GEM_BUG_ON(i915->gt.active_requests);
>   	for_each_engine(engine, i915, id) {
> -		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
> +		GEM_BUG_ON(__i915_active_request_peek(&engine->timeline.last_request));
>   		GEM_BUG_ON(engine->last_retired_context !=
>   			   to_intel_context(i915->kernel_context, engine));
>   	}
> @@ -3234,7 +3234,7 @@ wait_for_timelines(struct drm_i915_private *i915,
>   	list_for_each_entry(tl, &gt->active_list, link) {
>   		struct i915_request *rq;
>   
> -		rq = i915_gem_active_get_unlocked(&tl->last_request);
> +		rq = i915_active_request_get_unlocked(&tl->last_request);
>   		if (!rq)
>   			continue;
>   
> @@ -4135,7 +4135,8 @@ i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
>   }
>   
>   static void
> -frontbuffer_retire(struct i915_gem_active *active, struct i915_request *request)
> +frontbuffer_retire(struct i915_active_request *active,
> +		   struct i915_request *request)
>   {
>   	struct drm_i915_gem_object *obj =
>   		container_of(active, typeof(*obj), frontbuffer_write);
> @@ -4162,7 +4163,8 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
>   	obj->resv = &obj->__builtin_resv;
>   
>   	obj->frontbuffer_ggtt_origin = ORIGIN_GTT;
> -	init_request_active(&obj->frontbuffer_write, frontbuffer_retire);
> +	i915_active_request_init(&obj->frontbuffer_write,
> +				 NULL, frontbuffer_retire);
>   
>   	obj->mm.madv = I915_MADV_WILLNEED;
>   	INIT_RADIX_TREE(&obj->mm.get_page.radix, GFP_KERNEL | __GFP_NOWARN);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 6faf1f6faab5..ea8e818d22bf 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -653,8 +653,8 @@ last_request_on_engine(struct i915_timeline *timeline,
>   
>   	GEM_BUG_ON(timeline == &engine->timeline);
>   
> -	rq = i915_gem_active_raw(&timeline->last_request,
> -				 &engine->i915->drm.struct_mutex);
> +	rq = i915_active_request_raw(&timeline->last_request,
> +				     &engine->i915->drm.struct_mutex);
>   	if (rq && rq->engine == engine) {
>   		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
>   			  timeline->name, engine->name,
> diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.c b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> index bdb745d5747f..946a3a756787 100644
> --- a/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> +++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.c
> @@ -223,7 +223,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>   			 i915_gem_object_get_tiling(vma->obj)))
>   			return -EINVAL;
>   
> -		ret = i915_gem_active_retire(&vma->last_fence,
> +		ret = i915_active_request_retire(&vma->last_fence,
>   					     &vma->obj->base.dev->struct_mutex);
>   		if (ret)
>   			return ret;
> @@ -232,7 +232,7 @@ static int fence_update(struct drm_i915_fence_reg *fence,
>   	if (fence->vma) {
>   		struct i915_vma *old = fence->vma;
>   
> -		ret = i915_gem_active_retire(&old->last_fence,
> +		ret = i915_active_request_retire(&old->last_fence,
>   					     &old->obj->base.dev->struct_mutex);
>   		if (ret)
>   			return ret;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index d8819de0d6ee..be79c377fc59 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1918,7 +1918,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
>   		return ERR_PTR(-ENOMEM);
>   
>   	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
> -	init_request_active(&vma->last_fence, NULL);
> +	INIT_ACTIVE_REQUEST(&vma->last_fence);
>   
>   	vma->vm = &ggtt->vm;
>   	vma->ops = &pd_vma_ops;
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index 73fec917d097..fab040331cdb 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -175,7 +175,7 @@ struct drm_i915_gem_object {
>   
>   	atomic_t frontbuffer_bits;
>   	unsigned int frontbuffer_ggtt_origin; /* write once */
> -	struct i915_gem_active frontbuffer_write;
> +	struct i915_active_request frontbuffer_write;
>   
>   	/** Current tiling stride for the object, if it's tiled. */
>   	unsigned int tiling_and_stride;
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index 6e2e5ed2bd0a..9a65341fec09 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1062,23 +1062,23 @@ i915_error_object_create(struct drm_i915_private *i915,
>   }
>   
>   /* The error capture is special as tries to run underneath the normal
> - * locking rules - so we use the raw version of the i915_gem_active lookup.
> + * locking rules - so we use the raw version of the i915_active_request lookup.
>    */
>   static inline u32
> -__active_get_seqno(struct i915_gem_active *active)
> +__active_get_seqno(struct i915_active_request *active)
>   {
>   	struct i915_request *request;
>   
> -	request = __i915_gem_active_peek(active);
> +	request = __i915_active_request_peek(active);
>   	return request ? request->global_seqno : 0;
>   }
>   
>   static inline int
> -__active_get_engine_id(struct i915_gem_active *active)
> +__active_get_engine_id(struct i915_active_request *active)
>   {
>   	struct i915_request *request;
>   
> -	request = __i915_gem_active_peek(active);
> +	request = __i915_active_request_peek(active);
>   	return request ? request->engine->id : -1;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 4b1869295362..a09f47ccc703 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -29,6 +29,7 @@
>   #include <linux/sched/signal.h>
>   
>   #include "i915_drv.h"
> +#include "i915_active.h"
>   #include "i915_reset.h"
>   
>   static const char *i915_fence_get_driver_name(struct dma_fence *fence)
> @@ -125,12 +126,6 @@ static void unreserve_gt(struct drm_i915_private *i915)
>   		i915_gem_park(i915);
>   }
>   
> -void i915_gem_retire_noop(struct i915_gem_active *active,
> -			  struct i915_request *request)
> -{
> -	/* Space left intentionally blank */
> -}
> -
>   static void advance_ring(struct i915_request *request)
>   {
>   	struct intel_ring *ring = request->ring;
> @@ -244,7 +239,7 @@ static void __retire_engine_upto(struct intel_engine_cs *engine,
>   
>   static void i915_request_retire(struct i915_request *request)
>   {
> -	struct i915_gem_active *active, *next;
> +	struct i915_active_request *active, *next;
>   
>   	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
>   		  request->engine->name,
> @@ -278,10 +273,10 @@ static void i915_request_retire(struct i915_request *request)
>   		 * we may spend an inordinate amount of time simply handling
>   		 * the retirement of requests and processing their callbacks.
>   		 * Of which, this loop itself is particularly hot due to the
> -		 * cache misses when jumping around the list of i915_gem_active.
> -		 * So we try to keep this loop as streamlined as possible and
> -		 * also prefetch the next i915_gem_active to try and hide
> -		 * the likely cache miss.
> +		 * cache misses when jumping around the list of
> +		 * i915_active_request.  So we try to keep this loop as
> +		 * streamlined as possible and also prefetch the next
> +		 * i915_active_request to try and hide the likely cache miss.
>   		 */
>   		prefetchw(next);
>   
> @@ -526,17 +521,9 @@ i915_request_alloc_slow(struct intel_context *ce)
>   	return kmem_cache_alloc(ce->gem_context->i915->requests, GFP_KERNEL);
>   }
>   
> -static int add_barrier(struct i915_request *rq, struct i915_gem_active *active)
> -{
> -	struct i915_request *barrier =
> -		i915_gem_active_raw(active, &rq->i915->drm.struct_mutex);
> -
> -	return barrier ? i915_request_await_dma_fence(rq, &barrier->fence) : 0;
> -}
> -
>   static int add_timeline_barrier(struct i915_request *rq)
>   {
> -	return add_barrier(rq, &rq->timeline->barrier);
> +	return i915_request_await_active_request(rq, &rq->timeline->barrier);
>   }
>   
>   /**
> @@ -595,7 +582,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	 * We use RCU to look up requests in flight. The lookups may
>   	 * race with the request being allocated from the slab freelist.
>   	 * That is the request we are writing to here, may be in the process
> -	 * of being read by __i915_gem_active_get_rcu(). As such,
> +	 * of being read by __i915_active_request_get_rcu(). As such,
>   	 * we have to be very careful when overwriting the contents. During
>   	 * the RCU lookup, we change chase the request->engine pointer,
>   	 * read the request->global_seqno and increment the reference count.
> @@ -937,8 +924,8 @@ void i915_request_add(struct i915_request *request)
>   	 * see a more recent value in the hws than we are tracking.
>   	 */
>   
> -	prev = i915_gem_active_raw(&timeline->last_request,
> -				   &request->i915->drm.struct_mutex);
> +	prev = i915_active_request_raw(&timeline->last_request,
> +				       &request->i915->drm.struct_mutex);
>   	if (prev && !i915_request_completed(prev)) {
>   		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
>   					     &request->submitq);
> @@ -954,7 +941,7 @@ void i915_request_add(struct i915_request *request)
>   	spin_unlock_irq(&timeline->lock);
>   
>   	GEM_BUG_ON(timeline->seqno != request->fence.seqno);
> -	i915_gem_active_set(&timeline->last_request, request);
> +	__i915_active_request_set(&timeline->last_request, request);
>   
>   	list_add_tail(&request->ring_link, &ring->request_list);
>   	if (list_is_first(&request->ring_link, &ring->request_list)) {
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 3cffb96203b9..40f3e8dcbdd5 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -403,387 +403,4 @@ static inline void i915_request_mark_complete(struct i915_request *rq)
>   
>   void i915_retire_requests(struct drm_i915_private *i915);
>   
> -/*
> - * We treat requests as fences. This is not be to confused with our
> - * "fence registers" but pipeline synchronisation objects ala GL_ARB_sync.
> - * We use the fences to synchronize access from the CPU with activity on the
> - * GPU, for example, we should not rewrite an object's PTE whilst the GPU
> - * is reading them. We also track fences at a higher level to provide
> - * implicit synchronisation around GEM objects, e.g. set-domain will wait
> - * for outstanding GPU rendering before marking the object ready for CPU
> - * access, or a pageflip will wait until the GPU is complete before showing
> - * the frame on the scanout.
> - *
> - * In order to use a fence, the object must track the fence it needs to
> - * serialise with. For example, GEM objects want to track both read and
> - * write access so that we can perform concurrent read operations between
> - * the CPU and GPU engines, as well as waiting for all rendering to
> - * complete, or waiting for the last GPU user of a "fence register". The
> - * object then embeds a #i915_gem_active to track the most recent (in
> - * retirement order) request relevant for the desired mode of access.
> - * The #i915_gem_active is updated with i915_gem_active_set() to track the
> - * most recent fence request, typically this is done as part of
> - * i915_vma_move_to_active().
> - *
> - * When the #i915_gem_active completes (is retired), it will
> - * signal its completion to the owner through a callback as well as mark
> - * itself as idle (i915_gem_active.request == NULL). The owner
> - * can then perform any action, such as delayed freeing of an active
> - * resource including itself.
> - */
> -struct i915_gem_active;
> -
> -typedef void (*i915_gem_retire_fn)(struct i915_gem_active *,
> -				   struct i915_request *);
> -
> -struct i915_gem_active {
> -	struct i915_request __rcu *request;
> -	struct list_head link;
> -	i915_gem_retire_fn retire;
> -};
> -
> -void i915_gem_retire_noop(struct i915_gem_active *,
> -			  struct i915_request *request);
> -
> -/**
> - * init_request_active - prepares the activity tracker for use
> - * @active - the active tracker
> - * @func - a callback when then the tracker is retired (becomes idle),
> - *         can be NULL
> - *
> - * init_request_active() prepares the embedded @active struct for use as
> - * an activity tracker, that is for tracking the last known active request
> - * associated with it. When the last request becomes idle, when it is retired
> - * after completion, the optional callback @func is invoked.
> - */
> -static inline void
> -init_request_active(struct i915_gem_active *active,
> -		    i915_gem_retire_fn retire)
> -{
> -	RCU_INIT_POINTER(active->request, NULL);
> -	INIT_LIST_HEAD(&active->link);
> -	active->retire = retire ?: i915_gem_retire_noop;
> -}
> -
> -/**
> - * i915_gem_active_set - updates the tracker to watch the current request
> - * @active - the active tracker
> - * @request - the request to watch
> - *
> - * i915_gem_active_set() watches the given @request for completion. Whilst
> - * that @request is busy, the @active reports busy. When that @request is
> - * retired, the @active tracker is updated to report idle.
> - */
> -static inline void
> -i915_gem_active_set(struct i915_gem_active *active,
> -		    struct i915_request *request)
> -{
> -	list_move(&active->link, &request->active_list);
> -	rcu_assign_pointer(active->request, request);
> -}
> -
> -/**
> - * i915_gem_active_set_retire_fn - updates the retirement callback
> - * @active - the active tracker
> - * @fn - the routine called when the request is retired
> - * @mutex - struct_mutex used to guard retirements
> - *
> - * i915_gem_active_set_retire_fn() updates the function pointer that
> - * is called when the final request associated with the @active tracker
> - * is retired.
> - */
> -static inline void
> -i915_gem_active_set_retire_fn(struct i915_gem_active *active,
> -			      i915_gem_retire_fn fn,
> -			      struct mutex *mutex)
> -{
> -	lockdep_assert_held(mutex);
> -	active->retire = fn ?: i915_gem_retire_noop;
> -}
> -
> -static inline struct i915_request *
> -__i915_gem_active_peek(const struct i915_gem_active *active)
> -{
> -	/*
> -	 * Inside the error capture (running with the driver in an unknown
> -	 * state), we want to bend the rules slightly (a lot).
> -	 *
> -	 * Work is in progress to make it safer, in the meantime this keeps
> -	 * the known issue from spamming the logs.
> -	 */
> -	return rcu_dereference_protected(active->request, 1);
> -}
> -
> -/**
> - * i915_gem_active_raw - return the active request
> - * @active - the active tracker
> - *
> - * i915_gem_active_raw() returns the current request being tracked, or NULL.
> - * It does not obtain a reference on the request for the caller, so the caller
> - * must hold struct_mutex.
> - */
> -static inline struct i915_request *
> -i915_gem_active_raw(const struct i915_gem_active *active, struct mutex *mutex)
> -{
> -	return rcu_dereference_protected(active->request,
> -					 lockdep_is_held(mutex));
> -}
> -
> -/**
> - * i915_gem_active_peek - report the active request being monitored
> - * @active - the active tracker
> - *
> - * i915_gem_active_peek() returns the current request being tracked if
> - * still active, or NULL. It does not obtain a reference on the request
> - * for the caller, so the caller must hold struct_mutex.
> - */
> -static inline struct i915_request *
> -i915_gem_active_peek(const struct i915_gem_active *active, struct mutex *mutex)
> -{
> -	struct i915_request *request;
> -
> -	request = i915_gem_active_raw(active, mutex);
> -	if (!request || i915_request_completed(request))
> -		return NULL;
> -
> -	return request;
> -}
> -
> -/**
> - * i915_gem_active_get - return a reference to the active request
> - * @active - the active tracker
> - *
> - * i915_gem_active_get() returns a reference to the active request, or NULL
> - * if the active tracker is idle. The caller must hold struct_mutex.
> - */
> -static inline struct i915_request *
> -i915_gem_active_get(const struct i915_gem_active *active, struct mutex *mutex)
> -{
> -	return i915_request_get(i915_gem_active_peek(active, mutex));
> -}
> -
> -/**
> - * __i915_gem_active_get_rcu - return a reference to the active request
> - * @active - the active tracker
> - *
> - * __i915_gem_active_get() returns a reference to the active request, or NULL
> - * if the active tracker is idle. The caller must hold the RCU read lock, but
> - * the returned pointer is safe to use outside of RCU.
> - */
> -static inline struct i915_request *
> -__i915_gem_active_get_rcu(const struct i915_gem_active *active)
> -{
> -	/*
> -	 * Performing a lockless retrieval of the active request is super
> -	 * tricky. SLAB_TYPESAFE_BY_RCU merely guarantees that the backing
> -	 * slab of request objects will not be freed whilst we hold the
> -	 * RCU read lock. It does not guarantee that the request itself
> -	 * will not be freed and then *reused*. Viz,
> -	 *
> -	 * Thread A			Thread B
> -	 *
> -	 * rq = active.request
> -	 *				retire(rq) -> free(rq);
> -	 *				(rq is now first on the slab freelist)
> -	 *				active.request = NULL
> -	 *
> -	 *				rq = new submission on a new object
> -	 * ref(rq)
> -	 *
> -	 * To prevent the request from being reused whilst the caller
> -	 * uses it, we take a reference like normal. Whilst acquiring
> -	 * the reference we check that it is not in a destroyed state
> -	 * (refcnt == 0). That prevents the request being reallocated
> -	 * whilst the caller holds on to it. To check that the request
> -	 * was not reallocated as we acquired the reference we have to
> -	 * check that our request remains the active request across
> -	 * the lookup, in the same manner as a seqlock. The visibility
> -	 * of the pointer versus the reference counting is controlled
> -	 * by using RCU barriers (rcu_dereference and rcu_assign_pointer).
> -	 *
> -	 * In the middle of all that, we inspect whether the request is
> -	 * complete. Retiring is lazy so the request may be completed long
> -	 * before the active tracker is updated. Querying whether the
> -	 * request is complete is far cheaper (as it involves no locked
> -	 * instructions setting cachelines to exclusive) than acquiring
> -	 * the reference, so we do it first. The RCU read lock ensures the
> -	 * pointer dereference is valid, but does not ensure that the
> -	 * seqno nor HWS is the right one! However, if the request was
> -	 * reallocated, that means the active tracker's request was complete.
> -	 * If the new request is also complete, then both are and we can
> -	 * just report the active tracker is idle. If the new request is
> -	 * incomplete, then we acquire a reference on it and check that
> -	 * it remained the active request.
> -	 *
> -	 * It is then imperative that we do not zero the request on
> -	 * reallocation, so that we can chase the dangling pointers!
> -	 * See i915_request_alloc().
> -	 */
> -	do {
> -		struct i915_request *request;
> -
> -		request = rcu_dereference(active->request);
> -		if (!request || i915_request_completed(request))
> -			return NULL;
> -
> -		/*
> -		 * An especially silly compiler could decide to recompute the
> -		 * result of i915_request_completed, more specifically
> -		 * re-emit the load for request->fence.seqno. A race would catch
> -		 * a later seqno value, which could flip the result from true to
> -		 * false. Which means part of the instructions below might not
> -		 * be executed, while later on instructions are executed. Due to
> -		 * barriers within the refcounting the inconsistency can't reach
> -		 * past the call to i915_request_get_rcu, but not executing
> -		 * that while still executing i915_request_put() creates
> -		 * havoc enough.  Prevent this with a compiler barrier.
> -		 */
> -		barrier();
> -
> -		request = i915_request_get_rcu(request);
> -
> -		/*
> -		 * What stops the following rcu_access_pointer() from occurring
> -		 * before the above i915_request_get_rcu()? If we were
> -		 * to read the value before pausing to get the reference to
> -		 * the request, we may not notice a change in the active
> -		 * tracker.
> -		 *
> -		 * The rcu_access_pointer() is a mere compiler barrier, which
> -		 * means both the CPU and compiler are free to perform the
> -		 * memory read without constraint. The compiler only has to
> -		 * ensure that any operations after the rcu_access_pointer()
> -		 * occur afterwards in program order. This means the read may
> -		 * be performed earlier by an out-of-order CPU, or adventurous
> -		 * compiler.
> -		 *
> -		 * The atomic operation at the heart of
> -		 * i915_request_get_rcu(), see dma_fence_get_rcu(), is
> -		 * atomic_inc_not_zero() which is only a full memory barrier
> -		 * when successful. That is, if i915_request_get_rcu()
> -		 * returns the request (and so with the reference counted
> -		 * incremented) then the following read for rcu_access_pointer()
> -		 * must occur after the atomic operation and so confirm
> -		 * that this request is the one currently being tracked.
> -		 *
> -		 * The corresponding write barrier is part of
> -		 * rcu_assign_pointer().
> -		 */
> -		if (!request || request == rcu_access_pointer(active->request))
> -			return rcu_pointer_handoff(request);
> -
> -		i915_request_put(request);
> -	} while (1);
> -}
> -
> -/**
> - * i915_gem_active_get_unlocked - return a reference to the active request
> - * @active - the active tracker
> - *
> - * i915_gem_active_get_unlocked() returns a reference to the active request,
> - * or NULL if the active tracker is idle. The reference is obtained under RCU,
> - * so no locking is required by the caller.
> - *
> - * The reference should be freed with i915_request_put().
> - */
> -static inline struct i915_request *
> -i915_gem_active_get_unlocked(const struct i915_gem_active *active)
> -{
> -	struct i915_request *request;
> -
> -	rcu_read_lock();
> -	request = __i915_gem_active_get_rcu(active);
> -	rcu_read_unlock();
> -
> -	return request;
> -}
> -
> -/**
> - * i915_gem_active_isset - report whether the active tracker is assigned
> - * @active - the active tracker
> - *
> - * i915_gem_active_isset() returns true if the active tracker is currently
> - * assigned to a request. Due to the lazy retiring, that request may be idle
> - * and this may report stale information.
> - */
> -static inline bool
> -i915_gem_active_isset(const struct i915_gem_active *active)
> -{
> -	return rcu_access_pointer(active->request);
> -}
> -
> -/**
> - * i915_gem_active_wait - waits until the request is completed
> - * @active - the active request on which to wait
> - * @flags - how to wait
> - * @timeout - how long to wait at most
> - * @rps - userspace client to charge for a waitboost
> - *
> - * i915_gem_active_wait() waits until the request is completed before
> - * returning, without requiring any locks to be held. Note that it does not
> - * retire any requests before returning.
> - *
> - * This function relies on RCU in order to acquire the reference to the active
> - * request without holding any locks. See __i915_gem_active_get_rcu() for the
> - * glory details on how that is managed. Once the reference is acquired, we
> - * can then wait upon the request, and afterwards release our reference,
> - * free of any locking.
> - *
> - * This function wraps i915_request_wait(), see it for the full details on
> - * the arguments.
> - *
> - * Returns 0 if successful, or a negative error code.
> - */
> -static inline int
> -i915_gem_active_wait(const struct i915_gem_active *active, unsigned int flags)
> -{
> -	struct i915_request *request;
> -	long ret = 0;
> -
> -	request = i915_gem_active_get_unlocked(active);
> -	if (request) {
> -		ret = i915_request_wait(request, flags, MAX_SCHEDULE_TIMEOUT);
> -		i915_request_put(request);
> -	}
> -
> -	return ret < 0 ? ret : 0;
> -}
> -
> -/**
> - * i915_gem_active_retire - waits until the request is retired
> - * @active - the active request on which to wait
> - *
> - * i915_gem_active_retire() waits until the request is completed,
> - * and then ensures that at least the retirement handler for this
> - * @active tracker is called before returning. If the @active
> - * tracker is idle, the function returns immediately.
> - */
> -static inline int __must_check
> -i915_gem_active_retire(struct i915_gem_active *active,
> -		       struct mutex *mutex)
> -{
> -	struct i915_request *request;
> -	long ret;
> -
> -	request = i915_gem_active_raw(active, mutex);
> -	if (!request)
> -		return 0;
> -
> -	ret = i915_request_wait(request,
> -				I915_WAIT_INTERRUPTIBLE | I915_WAIT_LOCKED,
> -				MAX_SCHEDULE_TIMEOUT);
> -	if (ret < 0)
> -		return ret;
> -
> -	list_del_init(&active->link);
> -	RCU_INIT_POINTER(active->request, NULL);
> -
> -	active->retire(active, request);
> -
> -	return 0;
> -}
> -
> -#define for_each_active(mask, idx) \
> -	for (; mask ? idx = ffs(mask) - 1, 1 : 0; mask &= ~BIT(idx))
> -
>   #endif /* I915_REQUEST_H */
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 328b35410672..555f358bf6ba 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -888,7 +888,7 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>   		struct i915_request *rq;
>   		long timeout;
>   
> -		rq = i915_gem_active_get_unlocked(&tl->last_request);
> +		rq = i915_active_request_get_unlocked(&tl->last_request);
>   		if (!rq)
>   			continue;
>   
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index b354843a5040..b2202d2e58a2 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -163,8 +163,8 @@ int i915_timeline_init(struct drm_i915_private *i915,
>   
>   	spin_lock_init(&timeline->lock);
>   
> -	init_request_active(&timeline->barrier, NULL);
> -	init_request_active(&timeline->last_request, NULL);
> +	INIT_ACTIVE_REQUEST(&timeline->barrier);
> +	INIT_ACTIVE_REQUEST(&timeline->last_request);
>   	INIT_LIST_HEAD(&timeline->requests);
>   
>   	i915_syncmap_init(&timeline->sync);
> @@ -236,7 +236,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   {
>   	GEM_BUG_ON(timeline->pin_count);
>   	GEM_BUG_ON(!list_empty(&timeline->requests));
> -	GEM_BUG_ON(i915_gem_active_isset(&timeline->barrier));
> +	GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
>   
>   	i915_syncmap_free(&timeline->sync);
>   	hwsp_free(timeline);
> @@ -268,25 +268,6 @@ i915_timeline_create(struct drm_i915_private *i915,
>   	return timeline;
>   }
>   
> -int i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq)
> -{
> -	struct i915_request *old;
> -	int err;
> -
> -	lockdep_assert_held(&rq->i915->drm.struct_mutex);
> -
> -	/* Must maintain ordering wrt existing barriers */
> -	old = i915_gem_active_raw(&tl->barrier, &rq->i915->drm.struct_mutex);
> -	if (old) {
> -		err = i915_request_await_dma_fence(rq, &old->fence);
> -		if (err)
> -			return err;
> -	}
> -
> -	i915_gem_active_set(&tl->barrier, rq);
> -	return 0;
> -}
> -
>   int i915_timeline_pin(struct i915_timeline *tl)
>   {
>   	int err;
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index d167e04073c5..7bec7d2e45bf 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -28,6 +28,7 @@
>   #include <linux/list.h>
>   #include <linux/kref.h>
>   
> +#include "i915_active.h"
>   #include "i915_request.h"
>   #include "i915_syncmap.h"
>   #include "i915_utils.h"
> @@ -58,10 +59,10 @@ struct i915_timeline {
>   
>   	/* Contains an RCU guarded pointer to the last request. No reference is
>   	 * held to the request, users must carefully acquire a reference to
> -	 * the request using i915_gem_active_get_request_rcu(), or hold the
> +	 * the request using i915_active_request_get_request_rcu(), or hold the
>   	 * struct_mutex.
>   	 */
> -	struct i915_gem_active last_request;
> +	struct i915_active_request last_request;
>   
>   	/**
>   	 * We track the most recent seqno that we wait on in every context so
> @@ -82,7 +83,7 @@ struct i915_timeline {
>   	 * subsequent submissions to this timeline be executed only after the
>   	 * barrier has been completed.
>   	 */
> -	struct i915_gem_active barrier;
> +	struct i915_active_request barrier;
>   
>   	struct list_head link;
>   	const char *name;
> @@ -174,7 +175,10 @@ void i915_timelines_fini(struct drm_i915_private *i915);
>    * submissions on @timeline. Subsequent requests will not be submitted to GPU
>    * until the barrier has been completed.
>    */
> -int i915_timeline_set_barrier(struct i915_timeline *timeline,
> -			      struct i915_request *rq);
> +static inline int
> +i915_timeline_set_barrier(struct i915_timeline *tl, struct i915_request *rq)
> +{
> +	return i915_active_request_set(&tl->barrier, rq);
> +}
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 2456bfb4877b..376821c37d72 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -121,7 +121,7 @@ vma_create(struct drm_i915_gem_object *obj,
>   
>   	i915_active_init(i915_gt_active(vm->i915),
>   			 &vma->active, __i915_vma_retire);
> -	init_request_active(&vma->last_fence, NULL);
> +	INIT_ACTIVE_REQUEST(&vma->last_fence);
>   
>   	vma->vm = vm;
>   	vma->ops = &vm->vma_ops;
> @@ -809,7 +809,7 @@ static void __i915_vma_destroy(struct i915_vma *vma)
>   	GEM_BUG_ON(vma->node.allocated);
>   	GEM_BUG_ON(vma->fence);
>   
> -	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
> +	GEM_BUG_ON(i915_active_request_isset(&vma->last_fence));
>   
>   	mutex_lock(&vma->vm->mutex);
>   	list_del(&vma->vm_link);
> @@ -943,14 +943,14 @@ int i915_vma_move_to_active(struct i915_vma *vma,
>   		obj->write_domain = I915_GEM_DOMAIN_RENDER;
>   
>   		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
> -			i915_gem_active_set(&obj->frontbuffer_write, rq);
> +			__i915_active_request_set(&obj->frontbuffer_write, rq);
>   
>   		obj->read_domains = 0;
>   	}
>   	obj->read_domains |= I915_GEM_GPU_DOMAINS;
>   
>   	if (flags & EXEC_OBJECT_NEEDS_FENCE)
> -		i915_gem_active_set(&vma->last_fence, rq);
> +		__i915_active_request_set(&vma->last_fence, rq);
>   
>   	export_fence(vma, rq, flags);
>   	return 0;
> @@ -987,8 +987,8 @@ int i915_vma_unbind(struct i915_vma *vma)
>   		if (ret)
>   			goto unpin;
>   
> -		ret = i915_gem_active_retire(&vma->last_fence,
> -					     &vma->vm->i915->drm.struct_mutex);
> +		ret = i915_active_request_retire(&vma->last_fence,
> +					      &vma->vm->i915->drm.struct_mutex);
>   unpin:
>   		__i915_vma_unpin(vma);
>   		if (ret)
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 3c03d4569481..7c742027f866 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -110,7 +110,7 @@ struct i915_vma {
>   #define I915_VMA_GGTT_WRITE	BIT(15)
>   
>   	struct i915_active active;
> -	struct i915_gem_active last_fence;
> +	struct i915_active_request last_fence;
>   
>   	/**
>   	 * Support different GGTT views into the same object.
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 71c01eb13af1..49fa43ff02ba 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -1086,7 +1086,7 @@ bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
>   	 * the last request that remains in the timeline. When idle, it is
>   	 * the last executed context as tracked by retirement.
>   	 */
> -	rq = __i915_gem_active_peek(&engine->timeline.last_request);
> +	rq = __i915_active_request_peek(&engine->timeline.last_request);
>   	if (rq)
>   		return rq->hw_context == kernel_context;
>   	else
> diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
> index a9238fd07e30..c0df1dbb0069 100644
> --- a/drivers/gpu/drm/i915/intel_overlay.c
> +++ b/drivers/gpu/drm/i915/intel_overlay.c
> @@ -186,7 +186,7 @@ struct intel_overlay {
>   	struct overlay_registers __iomem *regs;
>   	u32 flip_addr;
>   	/* flip handling */
> -	struct i915_gem_active last_flip;
> +	struct i915_active_request last_flip;
>   };
>   
>   static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv,
> @@ -214,23 +214,23 @@ static void i830_overlay_clock_gating(struct drm_i915_private *dev_priv,
>   
>   static void intel_overlay_submit_request(struct intel_overlay *overlay,
>   					 struct i915_request *rq,
> -					 i915_gem_retire_fn retire)
> +					 i915_active_retire_fn retire)
>   {
> -	GEM_BUG_ON(i915_gem_active_peek(&overlay->last_flip,
> -					&overlay->i915->drm.struct_mutex));
> -	i915_gem_active_set_retire_fn(&overlay->last_flip, retire,
> -				      &overlay->i915->drm.struct_mutex);
> -	i915_gem_active_set(&overlay->last_flip, rq);
> +	GEM_BUG_ON(i915_active_request_peek(&overlay->last_flip,
> +					    &overlay->i915->drm.struct_mutex));
> +	i915_active_request_set_retire_fn(&overlay->last_flip, retire,
> +					  &overlay->i915->drm.struct_mutex);
> +	__i915_active_request_set(&overlay->last_flip, rq);
>   	i915_request_add(rq);
>   }
>   
>   static int intel_overlay_do_wait_request(struct intel_overlay *overlay,
>   					 struct i915_request *rq,
> -					 i915_gem_retire_fn retire)
> +					 i915_active_retire_fn retire)
>   {
>   	intel_overlay_submit_request(overlay, rq, retire);
> -	return i915_gem_active_retire(&overlay->last_flip,
> -				      &overlay->i915->drm.struct_mutex);
> +	return i915_active_request_retire(&overlay->last_flip,
> +					  &overlay->i915->drm.struct_mutex);
>   }
>   
>   static struct i915_request *alloc_request(struct intel_overlay *overlay)
> @@ -351,8 +351,9 @@ static void intel_overlay_release_old_vma(struct intel_overlay *overlay)
>   	i915_vma_put(vma);
>   }
>   
> -static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active,
> -					       struct i915_request *rq)
> +static void
> +intel_overlay_release_old_vid_tail(struct i915_active_request *active,
> +				   struct i915_request *rq)
>   {
>   	struct intel_overlay *overlay =
>   		container_of(active, typeof(*overlay), last_flip);
> @@ -360,7 +361,7 @@ static void intel_overlay_release_old_vid_tail(struct i915_gem_active *active,
>   	intel_overlay_release_old_vma(overlay);
>   }
>   
> -static void intel_overlay_off_tail(struct i915_gem_active *active,
> +static void intel_overlay_off_tail(struct i915_active_request *active,
>   				   struct i915_request *rq)
>   {
>   	struct intel_overlay *overlay =
> @@ -423,8 +424,8 @@ static int intel_overlay_off(struct intel_overlay *overlay)
>    * We have to be careful not to repeat work forever an make forward progess. */
>   static int intel_overlay_recover_from_interrupt(struct intel_overlay *overlay)
>   {
> -	return i915_gem_active_retire(&overlay->last_flip,
> -				      &overlay->i915->drm.struct_mutex);
> +	return i915_active_request_retire(&overlay->last_flip,
> +					  &overlay->i915->drm.struct_mutex);
>   }
>   
>   /* Wait for pending overlay flip and release old frame.
> @@ -1357,7 +1358,7 @@ void intel_overlay_setup(struct drm_i915_private *dev_priv)
>   	overlay->contrast = 75;
>   	overlay->saturation = 146;
>   
> -	init_request_active(&overlay->last_flip, NULL);
> +	INIT_ACTIVE_REQUEST(&overlay->last_flip);
>   
>   	mutex_lock(&dev_priv->drm.struct_mutex);
>   
> diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> index 1151c54d2acf..b0331b0bfbc0 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
> @@ -503,8 +503,8 @@ static int live_suppress_wait_preempt(void *arg)
>   				}
>   
>   				/* Disable NEWCLIENT promotion */
> -				i915_gem_active_set(&rq[i]->timeline->last_request,
> -						    dummy);
> +				__i915_active_request_set(&rq[i]->timeline->last_request,
> +							  dummy);
>   				i915_request_add(rq[i]);
>   			}
>   
> diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> index e5659aaa856d..d2de9ece2118 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_timeline.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> @@ -15,8 +15,8 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context)
>   
>   	spin_lock_init(&timeline->lock);
>   
> -	init_request_active(&timeline->barrier, NULL);
> -	init_request_active(&timeline->last_request, NULL);
> +	INIT_ACTIVE_REQUEST(&timeline->barrier);
> +	INIT_ACTIVE_REQUEST(&timeline->last_request);
>   	INIT_LIST_HEAD(&timeline->requests);
>   
>   	i915_syncmap_init(&timeline->sync);
> 

Hello conflicts!

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v2] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
  2019-01-30 15:57   ` Tvrtko Ursulin
@ 2019-01-30 16:37   ` Chris Wilson
  2019-01-30 16:47     ` Chris Wilson
  2019-01-30 17:25   ` [PATCH v3] " Chris Wilson
  2 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 16:37 UTC (permalink / raw)
  To: intel-gfx

Wrap the active tracking for GPU references in a slabcache for faster
allocations, and keep track of inflight nodes so we can reap the
stale entries upon idling (thereby trimming our memory usage).

v2: Automatically discard the trees every time the tracker idles; they
should be rarely used and fast to allocate as required.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_active.c            | 48 +++++++++++++++----
 drivers/gpu/drm/i915/i915_active.h            | 19 ++++++--
 drivers/gpu/drm/i915/i915_active_types.h      | 10 +++-
 drivers/gpu/drm/i915/i915_drv.h               |  2 +
 drivers/gpu/drm/i915/i915_gem.c               | 15 ++++--
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  2 +-
 drivers/gpu/drm/i915/i915_vma.c               |  3 +-
 drivers/gpu/drm/i915/selftests/i915_active.c  |  3 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  6 +++
 9 files changed, 86 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index e0182e19cb8b..fe475015221d 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -7,7 +7,9 @@
 #include "i915_drv.h"
 #include "i915_active.h"
 
-#define BKL(ref) (&(ref)->i915->drm.struct_mutex)
+#define i915_from_gt(x) \
+	container_of(x, struct drm_i915_private, gt.active_refs)
+#define BKL(ref) (&i915_from_gt((ref)->gt)->drm.struct_mutex)
 
 struct active_node {
 	struct i915_gem_active base;
@@ -20,8 +22,13 @@ static void
 __active_retire(struct i915_active *ref)
 {
 	GEM_BUG_ON(!ref->count);
-	if (!--ref->count)
-		ref->retire(ref);
+	if (--ref->count)
+		return;
+
+	/* return the unused nodes to our slabcache*/
+	i915_active_fini(ref);
+
+	ref->retire(ref);
 }
 
 static void
@@ -79,11 +86,11 @@ active_instance(struct i915_active *ref, u64 idx)
 			p = &parent->rb_left;
 	}
 
-	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	node = kmem_cache_alloc(ref->gt->slab_cache, GFP_KERNEL);
 
 	/* kmalloc may retire the ref->last (thanks shrinker)! */
 	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
-		kfree(node);
+		kmem_cache_free(ref->gt->slab_cache, node);
 		goto out;
 	}
 
@@ -119,11 +126,11 @@ active_instance(struct i915_active *ref, u64 idx)
 	return &ref->last;
 }
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref))
 {
-	ref->i915 = i915;
+	ref->gt = gt;
 	ref->retire = retire;
 	ref->tree = RB_ROOT;
 	init_request_active(&ref->last, last_retire);
@@ -161,6 +168,7 @@ void i915_active_release(struct i915_active *ref)
 
 int i915_active_wait(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 	int ret;
 
@@ -168,13 +176,16 @@ int i915_active_wait(struct i915_active *ref)
 	if (ret)
 		return ret;
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return 0;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		ret = i915_gem_active_retire(&it->base, BKL(ref));
 		if (ret)
 			return ret;
 
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
 
@@ -210,17 +221,36 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref)
 {
+	struct kmem_cache *slab = ref->gt->slab_cache;
 	struct active_node *it, *n;
 
+	lockdep_assert_held(BKL(ref));
 	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
 
+	if (RB_EMPTY_ROOT(&ref->tree))
+		return;
+
 	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(slab, it);
 	}
 	ref->tree = RB_ROOT;
 }
 
+int i915_gt_active_init(struct i915_gt_active *gt)
+{
+	gt->slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
+	if (!gt->slab_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void i915_gt_active_fini(struct i915_gt_active *gt)
+{
+	kmem_cache_destroy(gt->slab_cache);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/i915_active.c"
 #endif
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index c0729a046f98..13d644d7e689 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -9,10 +9,6 @@
 
 #include "i915_active_types.h"
 
-#include <linux/rbtree.h>
-
-#include "i915_request.h"
-
 /*
  * GPU activity tracking
  *
@@ -39,7 +35,7 @@
  * synchronisation.
  */
 
-void i915_active_init(struct drm_i915_private *i915,
+void i915_active_init(struct i915_gt_active *gt,
 		      struct i915_active *ref,
 		      void (*retire)(struct i915_active *ref));
 
@@ -63,4 +59,17 @@ i915_active_is_idle(const struct i915_active *ref)
 
 void i915_active_fini(struct i915_active *ref);
 
+/*
+ * Active refs memory management
+ *
+ * To be more economical with memory, we reap all the i915_active trees as
+ * they idle (when we know the active requests are inactive) and allocate the
+ * nodes from a local slab cache to hopefully reduce the fragmentation.
+ */
+
+int i915_gt_active_init(struct i915_gt_active *gt);
+void i915_gt_active_fini(struct i915_gt_active *gt);
+
+#define i915_gt_active(i915) (&(i915)->gt.active_refs)
+
 #endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_active_types.h b/drivers/gpu/drm/i915/i915_active_types.h
index 411e502ed8dd..817cc8c530f1 100644
--- a/drivers/gpu/drm/i915/i915_active_types.h
+++ b/drivers/gpu/drm/i915/i915_active_types.h
@@ -7,14 +7,16 @@
 #ifndef _I915_ACTIVE_TYPES_H_
 #define _I915_ACTIVE_TYPES_H_
 
+#include <linux/list.h>
 #include <linux/rbtree.h>
 
 #include "i915_request.h"
 
-struct drm_i915_private;
+struct i915_gt_active;
+struct kmem_cache;
 
 struct i915_active {
-	struct drm_i915_private *i915;
+	struct i915_gt_active *gt;
 
 	struct rb_root tree;
 	struct i915_gem_active last;
@@ -23,4 +25,8 @@ struct i915_active {
 	void (*retire)(struct i915_active *ref);
 };
 
+struct i915_gt_active {
+	struct kmem_cache *slab_cache;
+};
+
 #endif /* _I915_ACTIVE_TYPES_H_ */
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7946c271ab24..135be159091b 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1985,6 +1985,8 @@ struct drm_i915_private {
 			struct list_head hwsp_free_list;
 		} timelines;
 
+		struct i915_gt_active active_refs;
+
 		struct list_head active_rings;
 		struct list_head closed_vma;
 		u32 active_requests;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index caccff87a2a1..157079cec01d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4998,15 +4998,19 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
 	}
 
+	ret = i915_gt_active_init(i915_gt_active(dev_priv));
+	if (ret)
+		return ret;
+
 	i915_timelines_init(dev_priv);
 
 	ret = i915_gem_init_userptr(dev_priv);
 	if (ret)
-		return ret;
+		goto err_timelines;
 
 	ret = intel_uc_init_misc(dev_priv);
 	if (ret)
-		return ret;
+		goto err_userptr;
 
 	ret = intel_wopcm_init(&dev_priv->wopcm);
 	if (ret)
@@ -5122,9 +5126,13 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 err_uc_misc:
 	intel_uc_fini_misc(dev_priv);
 
-	if (ret != -EIO) {
+err_userptr:
+	if (ret != -EIO)
 		i915_gem_cleanup_userptr(dev_priv);
+err_timelines:
+	if (ret != -EIO) {
 		i915_timelines_fini(dev_priv);
+		i915_gt_active_fini(i915_gt_active(dev_priv));
 	}
 
 	if (ret == -EIO) {
@@ -5177,6 +5185,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
 	intel_uc_fini_misc(dev_priv);
 	i915_gem_cleanup_userptr(dev_priv);
 	i915_timelines_fini(dev_priv);
+	i915_gt_active_fini(i915_gt_active(dev_priv));
 
 	i915_gem_drain_freed_objects(dev_priv);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index e625659c03a2..d8819de0d6ee 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1917,7 +1917,7 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 	if (!vma)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(i915, &vma->active, NULL);
+	i915_active_init(i915_gt_active(i915), &vma->active, NULL);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = &ggtt->vm;
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index d4772061e642..2456bfb4877b 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -119,7 +119,8 @@ vma_create(struct drm_i915_gem_object *obj,
 	if (vma == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	i915_active_init(vm->i915, &vma->active, __i915_vma_retire);
+	i915_active_init(i915_gt_active(vm->i915),
+			 &vma->active, __i915_vma_retire);
 	init_request_active(&vma->last_fence, NULL);
 
 	vma->vm = vm;
diff --git a/drivers/gpu/drm/i915/selftests/i915_active.c b/drivers/gpu/drm/i915/selftests/i915_active.c
index 7c5c3068565b..0e923476920e 100644
--- a/drivers/gpu/drm/i915/selftests/i915_active.c
+++ b/drivers/gpu/drm/i915/selftests/i915_active.c
@@ -30,7 +30,8 @@ static int __live_active_setup(struct drm_i915_private *i915,
 	unsigned int count = 0;
 	int err = 0;
 
-	i915_active_init(i915, &active->base, __live_active_retire);
+	i915_active_init(i915_gt_active(i915),
+			 &active->base, __live_active_retire);
 	active->retired = false;
 
 	if (!i915_active_acquire(&active->base)) {
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 074a0d9cbf26..5b88f74c1677 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -69,6 +69,7 @@ static void mock_device_release(struct drm_device *dev)
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
 
 	drain_workqueue(i915->wq);
 	i915_gem_drain_freed_objects(i915);
@@ -228,6 +229,9 @@ struct drm_i915_private *mock_gem_device(void)
 	if (!i915->priorities)
 		goto err_dependencies;
 
+	if (i915_gt_active_init(i915_gt_active(i915)))
+		goto err_priorities;
+
 	i915_timelines_init(i915);
 
 	INIT_LIST_HEAD(&i915->gt.active_rings);
@@ -257,6 +261,8 @@ struct drm_i915_private *mock_gem_device(void)
 err_unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
 	i915_timelines_fini(i915);
+	i915_gt_active_fini(i915_gt_active(i915));
+err_priorities:
 	kmem_cache_destroy(i915->priorities);
 err_dependencies:
 	kmem_cache_destroy(i915->dependencies);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH v2] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30 16:37   ` [PATCH v2] " Chris Wilson
@ 2019-01-30 16:47     ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 16:47 UTC (permalink / raw)
  To: intel-gfx

Quoting Chris Wilson (2019-01-30 16:37:30)
> @@ -20,8 +22,13 @@ static void
>  __active_retire(struct i915_active *ref)
>  {
>         GEM_BUG_ON(!ref->count);
> -       if (!--ref->count)
> -               ref->retire(ref);
> +       if (--ref->count)
> +               return;
> +
> +       /* return the unused nodes to our slabcache*/
> +       i915_active_fini(ref);

This also means the struct is self-finalizing and we only need to keep
i915_active_fini() as a sanity check. Also, if we embed actives
everywhere, debugobject ahoy.
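
Roughly, a sketch of the debug-only check I have in mind:

    #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
    void i915_active_fini(struct i915_active *ref)
    {
            /* Every node should have been returned to the slabcache */
            GEM_BUG_ON(i915_gem_active_isset(&ref->last));
            GEM_BUG_ON(!RB_EMPTY_ROOT(&ref->tree));
    }
    #endif
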
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* [PATCH v3] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
  2019-01-30 15:57   ` Tvrtko Ursulin
  2019-01-30 16:37   ` [PATCH v2] " Chris Wilson
@ 2019-01-30 17:25   ` Chris Wilson
  2 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 17:25 UTC (permalink / raw)
  To: intel-gfx

Wrap the active tracking for GPU references in a slabcache for faster
allocations, and keep track of inflight nodes so we can reap the
stale entries upon idling (thereby trimming our memory usage).

v2: Automatically discard the trees every time the tracker idles; they
should be rarely used and fast to allocate as required.
v3: Nothing device-specific left, it's just a slabcache that we can
make global.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_active.c | 61 ++++++++++++++++++++++++------
 drivers/gpu/drm/i915/i915_active.h | 11 ++++--
 drivers/gpu/drm/i915/i915_gem.c    | 10 +++--
 drivers/gpu/drm/i915/i915_pci.c    |  3 ++
 4 files changed, 65 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_active.c b/drivers/gpu/drm/i915/i915_active.c
index e0182e19cb8b..d05fd92dbc82 100644
--- a/drivers/gpu/drm/i915/i915_active.c
+++ b/drivers/gpu/drm/i915/i915_active.c
@@ -9,6 +9,17 @@
 
 #define BKL(ref) (&(ref)->i915->drm.struct_mutex)
 
+/*
+ * Active refs memory management
+ *
+ * To be more economical with memory, we reap all the i915_active trees as
+ * they idle (when we know the active requests are inactive) and allocate the
+ * nodes from a local slab cache to hopefully reduce the fragmentation.
+ */
+static struct i915_global_active {
+	struct kmem_cache *slab_cache;
+} global;
+
 struct active_node {
 	struct i915_gem_active base;
 	struct i915_active *ref;
@@ -16,12 +27,29 @@ struct active_node {
 	u64 timeline;
 };
 
+static void
+__active_park(struct i915_active *ref)
+{
+	struct active_node *it, *n;
+
+	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
+		GEM_BUG_ON(i915_gem_active_isset(&it->base));
+		kmem_cache_free(global.slab_cache, it);
+	}
+	ref->tree = RB_ROOT;
+}
+
 static void
 __active_retire(struct i915_active *ref)
 {
 	GEM_BUG_ON(!ref->count);
-	if (!--ref->count)
-		ref->retire(ref);
+	if (--ref->count)
+		return;
+
+	/* return the unused nodes to our slabcache */
+	__active_park(ref);
+
+	ref->retire(ref);
 }
 
 static void
@@ -79,11 +107,11 @@ active_instance(struct i915_active *ref, u64 idx)
 			p = &parent->rb_left;
 	}
 
-	node = kmalloc(sizeof(*node), GFP_KERNEL);
+	node = kmem_cache_alloc(global.slab_cache, GFP_KERNEL);
 
 	/* kmalloc may retire the ref->last (thanks shrinker)! */
 	if (unlikely(!i915_gem_active_raw(&ref->last, BKL(ref)))) {
-		kfree(node);
+		kmem_cache_free(global.slab_cache, node);
 		goto out;
 	}
 
@@ -174,7 +202,7 @@ int i915_active_wait(struct i915_active *ref)
 			return ret;
 
 		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
+		kmem_cache_free(global.slab_cache, it);
 	}
 	ref->tree = RB_ROOT;
 
@@ -208,17 +236,26 @@ int i915_request_await_active(struct i915_request *rq, struct i915_active *ref)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
 void i915_active_fini(struct i915_active *ref)
 {
-	struct active_node *it, *n;
-
 	GEM_BUG_ON(i915_gem_active_isset(&ref->last));
+	GEM_BUG_ON(!RB_EMPTY_ROOT(&ref->tree));
+}
+#endif
 
-	rbtree_postorder_for_each_entry_safe(it, n, &ref->tree, node) {
-		GEM_BUG_ON(i915_gem_active_isset(&it->base));
-		kfree(it);
-	}
-	ref->tree = RB_ROOT;
+int __init i915_global_active_init(void)
+{
+	global.slab_cache = KMEM_CACHE(active_node, SLAB_HWCACHE_ALIGN);
+	if (!global.slab_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void __exit i915_global_active_exit(void)
+{
+	kmem_cache_destroy(global.slab_cache);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/i915_active.h b/drivers/gpu/drm/i915/i915_active.h
index c0729a046f98..48fdb1497883 100644
--- a/drivers/gpu/drm/i915/i915_active.h
+++ b/drivers/gpu/drm/i915/i915_active.h
@@ -9,10 +9,6 @@
 
 #include "i915_active_types.h"
 
-#include <linux/rbtree.h>
-
-#include "i915_request.h"
-
 /*
  * GPU activity tracking
  *
@@ -61,6 +57,13 @@ i915_active_is_idle(const struct i915_active *ref)
 	return !ref->count;
 }
 
+#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
 void i915_active_fini(struct i915_active *ref);
+#else
+static inline void i915_active_fini(struct i915_active *ref) { }
+#endif
+
+int i915_global_active_init(void);
+void i915_global_active_exit(void);
 
 #endif /* _I915_ACTIVE_H_ */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index caccff87a2a1..843b7fd93331 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -5002,11 +5002,11 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 
 	ret = i915_gem_init_userptr(dev_priv);
 	if (ret)
-		return ret;
+		goto err_timelines;
 
 	ret = intel_uc_init_misc(dev_priv);
 	if (ret)
-		return ret;
+		goto err_userptr;
 
 	ret = intel_wopcm_init(&dev_priv->wopcm);
 	if (ret)
@@ -5122,10 +5122,12 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 err_uc_misc:
 	intel_uc_fini_misc(dev_priv);
 
-	if (ret != -EIO) {
+err_userptr:
+	if (ret != -EIO)
 		i915_gem_cleanup_userptr(dev_priv);
+err_timelines:
+	if (ret != -EIO)
 		i915_timelines_fini(dev_priv);
-	}
 
 	if (ret == -EIO) {
 		mutex_lock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_pci.c b/drivers/gpu/drm/i915/i915_pci.c
index 44c23ac60347..751a787c83d1 100644
--- a/drivers/gpu/drm/i915/i915_pci.c
+++ b/drivers/gpu/drm/i915/i915_pci.c
@@ -793,6 +793,8 @@ static int __init i915_init(void)
 	bool use_kms = true;
 	int err;
 
+	i915_global_active_init();
+
 	err = i915_mock_selftests();
 	if (err)
 		return err > 0 ? 0 : err;
@@ -824,6 +826,7 @@ static void __exit i915_exit(void)
 		return;
 
 	pci_unregister_driver(&i915_pci_driver);
+	i915_global_active_exit();
 }
 
 module_init(i915_init);
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 38+ messages in thread

* Re: [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle
  2019-01-30  2:19 ` [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle Chris Wilson
@ 2019-01-30 17:54   ` Tvrtko Ursulin
  2019-01-30 18:25     ` Chris Wilson
  0 siblings, 1 reply; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-30 17:54 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:19, Chris Wilson wrote:
> In preparation for enabling HW semaphores, we need to keep in flight
> timeline HWSP alive until the entire system is idle, as any other
> timeline active on the GPU may still refer back to the already retired
> timeline. We both have to delay recycling available cachelines and
> unpinning old HWSP until the next idle point (i.e. on parking).
> 
> That we have to keep the HWSP alive for external references on HW raises
> an interesting conundrum. On a busy system, we may never see a global
> idle point, essentially meaning the resource will be leaking until we
> are forced to sleep. What we need is a set of RCU primitives for the GPU!
> This should also help mitigate the resource starvation issues
> promulgating from keeping all logical state pinned until idle (instead
> of as currently handled until the next context switch).
> 
> v2: Use idle barriers to free stale HWSP as soon as all current requests
> are idle, rather than rely on the system reaching a global idle point.
> (Tvrtko)
> v3: Replace the idle barrier with read locks.

Time to change the patch title and actually even rewrite the commit
message, I think.

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_request.c  |  30 ++--
>   drivers/gpu/drm/i915/i915_timeline.c | 229 +++++++++++++++++++++++++--
>   drivers/gpu/drm/i915/i915_timeline.h |   9 +-
>   3 files changed, 237 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index a09f47ccc703..07e4c3c68ecd 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -326,11 +326,6 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> -static u32 timeline_get_seqno(struct i915_timeline *tl)
> -{
> -	return tl->seqno += 1 + tl->has_initial_breadcrumb;
> -}
> -
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -539,8 +534,10 @@ struct i915_request *
>   i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   {
>   	struct drm_i915_private *i915 = engine->i915;
> -	struct i915_request *rq;
>   	struct intel_context *ce;
> +	struct i915_timeline *tl;
> +	struct i915_request *rq;
> +	u32 seqno;
>   	int ret;
>   
>   	lockdep_assert_held(&i915->drm.struct_mutex);
> @@ -615,24 +612,26 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   		}
>   	}
>   
> -	rq->rcustate = get_state_synchronize_rcu();
> -
>   	INIT_LIST_HEAD(&rq->active_list);
> +
> +	tl = ce->ring->timeline;
> +	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> +	if (ret)
> +		goto err_free;
> +
>   	rq->i915 = i915;
>   	rq->engine = engine;
>   	rq->gem_context = ctx;
>   	rq->hw_context = ce;
>   	rq->ring = ce->ring;
> -	rq->timeline = ce->ring->timeline;
> +	rq->timeline = tl;
>   	GEM_BUG_ON(rq->timeline == &engine->timeline);
> -	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
> +	rq->hwsp_seqno = tl->hwsp_seqno;
> +	rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
>   
>   	spin_lock_init(&rq->lock);
> -	dma_fence_init(&rq->fence,
> -		       &i915_fence_ops,
> -		       &rq->lock,
> -		       rq->timeline->fence_context,
> -		       timeline_get_seqno(rq->timeline));
> +	dma_fence_init(&rq->fence, &i915_fence_ops, &rq->lock,
> +		       tl->fence_context, seqno);
>   
>   	/* We bump the ref for the fence chain */
>   	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
> @@ -693,6 +692,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
>   	GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
>   
> +err_free:
>   	kmem_cache_free(i915->requests, rq);
>   err_unreserve:
>   	unreserve_gt(i915);
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index b2202d2e58a2..fd1a92a3663d 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -6,19 +6,28 @@
>   
>   #include "i915_drv.h"
>   
> -#include "i915_timeline.h"
> +#include "i915_active.h"
>   #include "i915_syncmap.h"
> +#include "i915_timeline.h"
>   
>   struct i915_timeline_hwsp {
> -	struct i915_vma *vma;
> +	struct i915_gt_timelines *gt;

Coming back to a comment from one of the previous reviews, this is also 
called gt but is a different thing altogether. I would really like us 
to afford ourselves a few more characters so it is just easier to read 
the code.
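
Even something as simple as (the new name is only a suggestion):

    struct i915_timeline_hwsp {
            struct i915_gt_timelines *gt_timelines;
            struct list_head free_link;
            struct i915_vma *vma;
            u64 free_bitmap;
    };

would make it obvious at a glance that this is not the same gt as 
elsewhere.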

>   	struct list_head free_link;
> +	struct i915_vma *vma;
>   	u64 free_bitmap;
>   };
>   
> -static inline struct i915_timeline_hwsp *
> -i915_timeline_hwsp(const struct i915_timeline *tl)
> +struct i915_timeline_cacheline {
> +	struct i915_active active;
> +	struct i915_timeline_hwsp *hwsp;
> +	unsigned int cacheline : 6;
> +	unsigned int free : 1;
> +};
> +
> +static inline struct drm_i915_private *
> +hwsp_to_i915(struct i915_timeline_hwsp *hwsp)
>   {
> -	return tl->hwsp_ggtt->private;
> +	return container_of(hwsp->gt, struct drm_i915_private, gt.timelines);
>   }
>   
>   static struct i915_vma *__hwsp_alloc(struct drm_i915_private *i915)
> @@ -71,6 +80,7 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
>   		vma->private = hwsp;
>   		hwsp->vma = vma;
>   		hwsp->free_bitmap = ~0ull;
> +		hwsp->gt = gt;
>   
>   		spin_lock(&gt->hwsp_lock);
>   		list_add(&hwsp->free_link, &gt->hwsp_free_list);
> @@ -88,14 +98,9 @@ hwsp_alloc(struct i915_timeline *timeline, unsigned int *cacheline)
>   	return hwsp->vma;
>   }
>   
> -static void hwsp_free(struct i915_timeline *timeline)
> +static void __idle_hwsp_free(struct i915_timeline_hwsp *hwsp, int cacheline)
>   {
> -	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> -	struct i915_timeline_hwsp *hwsp;
> -
> -	hwsp = i915_timeline_hwsp(timeline);
> -	if (!hwsp) /* leave global HWSP alone! */
> -		return;
> +	struct i915_gt_timelines *gt = hwsp->gt;
>   
>   	spin_lock(&gt->hwsp_lock);
>   
> @@ -103,7 +108,8 @@ static void hwsp_free(struct i915_timeline *timeline)
>   	if (!hwsp->free_bitmap)
>   		list_add_tail(&hwsp->free_link, &gt->hwsp_free_list);
>   
> -	hwsp->free_bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
> +	GEM_BUG_ON(cacheline >= BITS_PER_TYPE(hwsp->free_bitmap));
> +	hwsp->free_bitmap |= BIT_ULL(cacheline);
>   
>   	/* And if no one is left using it, give the page back to the system */
>   	if (hwsp->free_bitmap == ~0ull) {
> @@ -115,6 +121,80 @@ static void hwsp_free(struct i915_timeline *timeline)
>   	spin_unlock(&gt->hwsp_lock);
>   }
>   
> +static void __idle_cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> +	GEM_BUG_ON(!i915_active_is_idle(&cl->active));
> +
> +	i915_vma_put(cl->hwsp->vma);
> +	__idle_hwsp_free(cl->hwsp, cl->cacheline);
> +
> +	i915_active_fini(&cl->active);
> +	kfree(cl);
> +}
> +
> +static void __idle_cacheline_park(struct i915_timeline_cacheline *cl)
> +{
> +	i915_active_fini(&cl->active);
> +}
> +
> +static void __cacheline_retire(struct i915_active *active)
> +{
> +	struct i915_timeline_cacheline *cl =
> +		container_of(active, typeof(*cl), active);
> +
> +	i915_vma_unpin(cl->hwsp->vma);
> +	if (!cl->free)
> +		__idle_cacheline_park(cl);
> +	else
> +		__idle_cacheline_free(cl);
> +}
> +
> +static struct i915_timeline_cacheline *
> +cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
> +{
> +	struct i915_timeline_cacheline *cl;
> +
> +	GEM_BUG_ON(cacheline >= 64);

Maybe pull out CACHELINES_PER_PAGE as HWSP_CACHELINES_PER_PAGE or something?
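
E.g. (HWSP_CACHELINES_PER_PAGE is just a made-up name):

    #define HWSP_CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)

            GEM_BUG_ON(cacheline >= HWSP_CACHELINES_PER_PAGE);

so the magic 64 is spelled out rather than hardcoded.
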

> +
> +	cl = kmalloc(sizeof(*cl), GFP_KERNEL);
> +	if (!cl)
> +		return ERR_PTR(-ENOMEM);
> +
> +	i915_vma_get(hwsp->vma);
> +	cl->hwsp = hwsp;
> +	cl->cacheline = cacheline;
> +	cl->free = false;
> +
> +	i915_active_init(i915_gt_active(hwsp_to_i915(hwsp)),
> +			 &cl->active, __cacheline_retire);
> +
> +	return cl;
> +}
> +
> +static void cacheline_acquire(struct i915_timeline_cacheline *cl)
> +{
> +	if (cl && i915_active_acquire(&cl->active))
> +		__i915_vma_pin(cl->hwsp->vma);
> +}
> +
> +static void cacheline_release(struct i915_timeline_cacheline *cl)
> +{
> +	if (cl)
> +		i915_active_release(&cl->active);
> +}
> +
> +static void cacheline_free(struct i915_timeline_cacheline *cl)
> +{
> +	if (!cl)
> +		return;
> +
> +	GEM_BUG_ON(cl->free);
> +	cl->free = true;
> +
> +	if (i915_active_is_idle(&cl->active))
> +		__idle_cacheline_free(cl);
> +}
> +
>   int i915_timeline_init(struct drm_i915_private *i915,
>   		       struct i915_timeline *timeline,
>   		       const char *name,
> @@ -136,22 +216,32 @@ int i915_timeline_init(struct drm_i915_private *i915,
>   	timeline->name = name;
>   	timeline->pin_count = 0;
>   	timeline->has_initial_breadcrumb = !hwsp;
> +	timeline->hwsp_cacheline = NULL;
>   
>   	timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
>   	if (!hwsp) {
> +		struct i915_timeline_cacheline *cl;
>   		unsigned int cacheline;
>   
>   		hwsp = hwsp_alloc(timeline, &cacheline);
>   		if (IS_ERR(hwsp))
>   			return PTR_ERR(hwsp);
>   
> +		cl = cacheline_alloc(hwsp->private, cacheline);
> +		if (IS_ERR(cl)) {
> +			__idle_hwsp_free(hwsp->private, cacheline);
> +			return PTR_ERR(cl);
> +		}
> +
>   		timeline->hwsp_offset = cacheline * CACHELINE_BYTES;
> +		timeline->hwsp_cacheline = cl;
>   	}
>   	timeline->hwsp_ggtt = i915_vma_get(hwsp);
> +	GEM_BUG_ON(timeline->hwsp_offset >= hwsp->size);
>   
>   	vaddr = i915_gem_object_pin_map(hwsp->obj, I915_MAP_WB);
>   	if (IS_ERR(vaddr)) {
> -		hwsp_free(timeline);
> +		cacheline_free(timeline->hwsp_cacheline);
>   		i915_vma_put(hwsp);
>   		return PTR_ERR(vaddr);
>   	}
> @@ -239,7 +329,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   	GEM_BUG_ON(i915_active_request_isset(&timeline->barrier));
>   
>   	i915_syncmap_free(&timeline->sync);
> -	hwsp_free(timeline);
> +	cacheline_free(timeline->hwsp_cacheline);
>   
>   	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
>   	i915_vma_put(timeline->hwsp_ggtt);
> @@ -284,6 +374,7 @@ int i915_timeline_pin(struct i915_timeline *tl)
>   		i915_ggtt_offset(tl->hwsp_ggtt) +
>   		offset_in_page(tl->hwsp_offset);
>   
> +	cacheline_acquire(tl->hwsp_cacheline);
>   	timeline_add_to_active(tl);
>   
>   	return 0;
> @@ -293,6 +384,113 @@ int i915_timeline_pin(struct i915_timeline *tl)
>   	return err;
>   }
>   
> +static u32 timeline_advance(struct i915_timeline *tl)
> +{
> +	GEM_BUG_ON(!tl->pin_count);
> +	GEM_BUG_ON(tl->seqno & tl->has_initial_breadcrumb);
> +
> +	return tl->seqno += 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static void timeline_rollback(struct i915_timeline *tl)
> +{
> +	tl->seqno -= 1 + tl->has_initial_breadcrumb;
> +}
> +
> +static noinline int
> +__i915_timeline_get_seqno(struct i915_timeline *tl,
> +			  struct i915_request *rq,
> +			  u32 *seqno)
> +{
> +	struct i915_timeline_cacheline *cl;
> +	struct i915_vma *vma;
> +	unsigned int cacheline;
> +	int err;
> +
> +	/*
> +	 * If there is an outstanding GPU reference to this cacheline,
> +	 * such as it being sampled by a HW semaphore on another timeline,
> +	 * we cannot wraparound our seqno value (the HW semaphore does
> +	 * a strict greater-than-or-equals compare, not i915_seqno_passed).
> +	 * So if the cacheline is still busy, we must detach ourselves
> +	 * from it and leave it inflight alongside its users.
> +	 *
> +	 * However, if nobody is watching and we can guarantee that nobody
> +	 * will, we could simply reuse the same cacheline.
> +	 *
> +	 * // while locked
> +	 * if (i915_active_request_is_signaled(&tl->last_request) &&
> +	 *     i915_active_is_signaled(&tl->hwsp_cacheline->active))
> +	 *	return 0;
> +	 *
> +	 * That seems unlikely for a busy timeline that needed to wrap in
> +	 * the first place, so just replace the cacheline.
> +	 */
> +
> +	vma = hwsp_alloc(tl, &cacheline);
> +	if (IS_ERR(vma)) {
> +		err = PTR_ERR(vma);
> +		goto err_rollback;
> +	}
> +
> +	cl = cacheline_alloc(vma->private, cacheline);
> +	if (IS_ERR(cl)) {
> +		err = PTR_ERR(cl);
> +		goto err_hwsp;
> +	}
> +
> +	/*
> +	 * Attach the old cacheline to the current request, so that we only
> +	 * free it after the current request is retired, which ensures that
> +	 * all writes into the cacheline from previous requests are complete.
> +	 */
> +	err = i915_active_ref(&tl->hwsp_cacheline->active,
> +			      tl->fence_context, rq);

Right, this is the rq + 1 magic akin to unpinning the previous context. 
It confused me for a bit why we would be assigning the old cacheline to 
the current rq.

> +	if (err)
> +		goto err_cacheline;
> +
> +	tl->hwsp_ggtt = i915_vma_get(vma);
> +	tl->hwsp_offset = cacheline * CACHELINE_BYTES;
> +	__i915_vma_pin(tl->hwsp_ggtt);
> +
> +	cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
> +	cacheline_free(tl->hwsp_cacheline);
> +
> +	cacheline_acquire(cl);
> +	tl->hwsp_cacheline = cl;
> +
> +	*seqno = timeline_advance(tl);
> +	return 0;
> +
> +err_cacheline:

i915_vma_put looks to be missing here to fully unwind cacheline_alloc.
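
Something like this perhaps (untested, just to illustrate the unwind I 
have in mind):

    err_cacheline:
            i915_vma_put(vma);
            kfree(cl);
    err_hwsp:
            __idle_hwsp_free(vma->private, cacheline);
    err_rollback:
            timeline_rollback(tl);
            return err;

so the reference taken via i915_vma_get() in cacheline_alloc() is 
dropped on the way out.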

> +	kfree(cl);
> +err_hwsp:
> +	__idle_hwsp_free(vma->private, cacheline);
> +err_rollback:
> +	timeline_rollback(tl);
> +	return err;
> +}
> +
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> +			    struct i915_request *rq,
> +			    u32 *seqno)
> +{
> +	*seqno = timeline_advance(tl);
> +
> +	/* Replace the HWSP on wraparound for HW semaphores */
> +	if (unlikely(!*seqno && tl->hwsp_cacheline))
> +		return __i915_timeline_get_seqno(tl, rq, seqno);
> +
> +	return 0;
> +}
> +
> +int i915_timeline_read_lock(struct i915_timeline *tl, struct i915_request *rq)
> +{
> +	GEM_BUG_ON(!tl->pin_count);
> +	return i915_active_ref(&tl->hwsp_cacheline->active,
> +			       rq->fence.context, rq);
> +}
> +
>   void i915_timeline_unpin(struct i915_timeline *tl)
>   {
>   	GEM_BUG_ON(!tl->pin_count);
> @@ -300,6 +498,7 @@ void i915_timeline_unpin(struct i915_timeline *tl)
>   		return;
>   
>   	timeline_remove_from_active(tl);
> +	cacheline_release(tl->hwsp_cacheline);
>   
>   	/*
>   	 * Since this timeline is idle, all bariers upon which we were waiting
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 7bec7d2e45bf..d78ec6fbc000 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -34,7 +34,7 @@
>   #include "i915_utils.h"
>   
>   struct i915_vma;
> -struct i915_timeline_hwsp;
> +struct i915_timeline_cacheline;
>   
>   struct i915_timeline {
>   	u64 fence_context;
> @@ -49,6 +49,8 @@ struct i915_timeline {
>   	struct i915_vma *hwsp_ggtt;
>   	u32 hwsp_offset;
>   
> +	struct i915_timeline_cacheline *hwsp_cacheline;
> +
>   	bool has_initial_breadcrumb;
>   
>   	/**
> @@ -160,6 +162,11 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
>   }
>   
>   int i915_timeline_pin(struct i915_timeline *tl);
> +int i915_timeline_get_seqno(struct i915_timeline *tl,
> +			    struct i915_request *rq,
> +			    u32 *seqno);
> +int i915_timeline_read_lock(struct i915_timeline *tl,
> +			    struct i915_request *rq);
>   void i915_timeline_unpin(struct i915_timeline *tl);
>   
>   void i915_timelines_init(struct drm_i915_private *i915);
> 

I like it! Took me some time to figure it out but it looks good and is 
definitely elegant.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-30  2:19 ` [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
@ 2019-01-30 18:05   ` Tvrtko Ursulin
  2019-01-30 18:14     ` Chris Wilson
  0 siblings, 1 reply; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-30 18:05 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:19, Chris Wilson wrote:
> In the next patch, we add another user that wants to check whether
> requests can be merged into a single HW execution, and in the future we
> want to add more conditions under which requests from the same context
> cannot be merged. In preparation, extract out can_merge_rq().
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/intel_lrc.c | 30 +++++++++++++++++++-----------
>   1 file changed, 19 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 2616b0b3e8d5..e97ce54138d3 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -285,12 +285,11 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
>   }
>   
>   __maybe_unused static inline bool
> -assert_priority_queue(const struct intel_engine_execlists *execlists,
> -		      const struct i915_request *prev,
> +assert_priority_queue(const struct i915_request *prev,
>   		      const struct i915_request *next)
>   {
> -	if (!prev)
> -		return true;
> +	const struct intel_engine_execlists *execlists =
> +		&prev->engine->execlists;
>   
>   	/*
>   	 * Without preemption, the prev may refer to the still active element
> @@ -601,6 +600,17 @@ static bool can_merge_ctx(const struct intel_context *prev,
>   	return true;
>   }
>   
> +static bool can_merge_rq(const struct i915_request *prev,
> +			 const struct i915_request *next)
> +{
> +	GEM_BUG_ON(!assert_priority_queue(prev, next));
> +
> +	if (!can_merge_ctx(prev->hw_context, next->hw_context))
> +		return false;
> +
> +	return true;

I'll assume you'll be adding here in the future as the reason this is 
not simply "return can_merge_ctx(...)"?

> +}
> +
>   static void port_assign(struct execlist_port *port, struct i915_request *rq)
>   {
>   	GEM_BUG_ON(rq == port_request(port));
> @@ -753,8 +763,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   		int i;
>   
>   		priolist_for_each_request_consume(rq, rn, p, i) {
> -			GEM_BUG_ON(!assert_priority_queue(execlists, last, rq));
> -
>   			/*
>   			 * Can we combine this request with the current port?
>   			 * It has to be the same context/ringbuffer and not
> @@ -766,8 +774,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   			 * second request, and so we never need to tell the
>   			 * hardware about the first.
>   			 */
> -			if (last &&
> -			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
> +			if (last && !can_merge_rq(last, rq)) {
> +				if (last->hw_context == rq->hw_context)
> +					goto done;

I don't get this added check. AFAICS it will only trigger with GVT 
making it not consider filling both ports if possible.

> +
>   				/*
>   				 * If we are on the second port and cannot
>   				 * combine this request with the last, then we
> @@ -787,7 +797,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   				    ctx_single_port_submission(rq->hw_context))
>   					goto done;
>   
> -				GEM_BUG_ON(last->hw_context == rq->hw_context);

This is related to the previous comment. Rebase error?

>   
>   				if (submit)
>   					port_assign(port, last);
> @@ -827,8 +836,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>   	 * request triggering preemption on the next dequeue (or subsequent
>   	 * interrupt for secondary ports).
>   	 */
> -	execlists->queue_priority_hint =
> -		port != execlists->port ? rq_prio(last) : INT_MIN;
> +	execlists->queue_priority_hint = queue_prio(execlists);

This shouldn't be in this patch.

>   
>   	if (submit) {
>   		port_assign(port, last);
> 

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-30 18:05   ` Tvrtko Ursulin
@ 2019-01-30 18:14     ` Chris Wilson
  2019-01-31  9:19       ` Tvrtko Ursulin
  0 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 18:14 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-30 18:05:42)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > In the next patch, we add another user that wants to check whether
> > requests can be merged into a single HW execution, and in the future we
> > want to add more conditions under which requests from the same context
> > cannot be merged. In preparation, extract out can_merge_rq().
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/intel_lrc.c | 30 +++++++++++++++++++-----------
> >   1 file changed, 19 insertions(+), 11 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> > index 2616b0b3e8d5..e97ce54138d3 100644
> > --- a/drivers/gpu/drm/i915/intel_lrc.c
> > +++ b/drivers/gpu/drm/i915/intel_lrc.c
> > @@ -285,12 +285,11 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
> >   }
> >   
> >   __maybe_unused static inline bool
> > -assert_priority_queue(const struct intel_engine_execlists *execlists,
> > -                   const struct i915_request *prev,
> > +assert_priority_queue(const struct i915_request *prev,
> >                     const struct i915_request *next)
> >   {
> > -     if (!prev)
> > -             return true;
> > +     const struct intel_engine_execlists *execlists =
> > +             &prev->engine->execlists;
> >   
> >       /*
> >        * Without preemption, the prev may refer to the still active element
> > @@ -601,6 +600,17 @@ static bool can_merge_ctx(const struct intel_context *prev,
> >       return true;
> >   }
> >   
> > +static bool can_merge_rq(const struct i915_request *prev,
> > +                      const struct i915_request *next)
> > +{
> > +     GEM_BUG_ON(!assert_priority_queue(prev, next));
> > +
> > +     if (!can_merge_ctx(prev->hw_context, next->hw_context))
> > +             return false;
> > +
> > +     return true;
> 
> I'll assume you'll be adding here in the future as the reason this is 
> not simply "return can_merge_ctx(...)"?

Yes, raison d'etre of making the change.

> >   static void port_assign(struct execlist_port *port, struct i915_request *rq)
> >   {
> >       GEM_BUG_ON(rq == port_request(port));
> > @@ -753,8 +763,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >               int i;
> >   
> >               priolist_for_each_request_consume(rq, rn, p, i) {
> > -                     GEM_BUG_ON(!assert_priority_queue(execlists, last, rq));
> > -
> >                       /*
> >                        * Can we combine this request with the current port?
> >                        * It has to be the same context/ringbuffer and not
> > @@ -766,8 +774,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >                        * second request, and so we never need to tell the
> >                        * hardware about the first.
> >                        */
> > -                     if (last &&
> > -                         !can_merge_ctx(rq->hw_context, last->hw_context)) {
> > +                     if (last && !can_merge_rq(last, rq)) {
> > +                             if (last->hw_context == rq->hw_context)
> > +                                     goto done;
> 
> I don't get this added check. AFAICS it will only trigger with GVT 
> making it not consider filling both ports if possible.

Because we are preparing for can_merge_rq() deciding not to merge the
same context. If we do that we can't continue on to the next port and
must terminate the loop, violating the trick with the hint in the
process.

This changes due to the next patch, per-context freq and probably more
that I've forgotten.
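
For example, the kind of same-context condition I'm making room for is
the semaphore check that patch 10 adds; roughly (sketch only, the real
version lands in that patch):

	static bool can_merge_rq(const struct i915_request *prev,
				 const struct i915_request *next)
	{
		GEM_BUG_ON(!assert_priority_queue(prev, next));

		/* e.g. don't coalesce past an unstarted busywait (see patch 10) */
		if (prev->sched.semaphore && !i915_request_started(prev))
			return false;

		if (!can_merge_ctx(prev->hw_context, next->hw_context))
			return false;

		return true;
	}

Once a condition like that can refuse to merge two requests of the same
context, the only sane response on seeing it is to stop dequeuing.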

> > +
> >                               /*
> >                                * If we are on the second port and cannot
> >                                * combine this request with the last, then we
> > @@ -787,7 +797,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >                                   ctx_single_port_submission(rq->hw_context))
> >                                       goto done;
> >   
> > -                             GEM_BUG_ON(last->hw_context == rq->hw_context);
> 
> This is related to the previous comment. Rebase error?

Previous if check, so it's clear at this point that we can't be using
the same.

> > @@ -827,8 +836,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >        * request triggering preemption on the next dequeue (or subsequent
> >        * interrupt for secondary ports).
> >        */
> > -     execlists->queue_priority_hint =
> > -             port != execlists->port ? rq_prio(last) : INT_MIN;
> > +     execlists->queue_priority_hint = queue_prio(execlists);
> 
> This shouldn't be in this patch.

If we terminate the loop early, we need to look at the head of the
queue.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle
  2019-01-30 17:54   ` Tvrtko Ursulin
@ 2019-01-30 18:25     ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 18:25 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-30 17:54:51)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > In preparation for enabling HW semaphores, we need to keep in flight
> > timeline HWSP alive until the entire system is idle, as any other
> > timeline active on the GPU may still refer back to the already retired
> > timeline. We both have to delay recycling available cachelines and
> > unpinning old HWSP until the next idle point (i.e. on parking).
> > 
> > That we have to keep the HWSP alive for external references on HW raises
> > an interesting conundrum. On a busy system, we may never see a global
> > idle point, essentially meaning the resource will be leaking until we
> > are forced to sleep. What we need is a set of RCU primitives for the GPU!
> > This should also help mitigate the resource starvation issues
> > promulgating from keeping all logical state pinned until idle (instead
> > of as currently handled until the next context switch).
> > 
> > v2: Use idle barriers to free stale HWSP as soon as all current requests
> > are idle, rather than rely on the system reaching a global idle point.
> > (Tvrtko)
> > v3: Replace the idle barrier with read locks.
> 
> Time to change patch title and actually even rewrite the commit message 
> I think.

Why? We are keeping it until it is idle in the system, not just the
timeline. "Keep timeline HWSP allocated until idle in the system".

First paragraph is still true. Second paragraph can be more concise and
need not be a flight of fantasy.

> > diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> > index b2202d2e58a2..fd1a92a3663d 100644
> > --- a/drivers/gpu/drm/i915/i915_timeline.c
> > +++ b/drivers/gpu/drm/i915/i915_timeline.c
> > @@ -6,19 +6,28 @@
> >   
> >   #include "i915_drv.h"
> >   
> > -#include "i915_timeline.h"
> > +#include "i915_active.h"
> >   #include "i915_syncmap.h"
> > +#include "i915_timeline.h"
> >   
> >   struct i915_timeline_hwsp {
> > -     struct i915_vma *vma;
> > +     struct i915_gt_timelines *gt;
> 
> Coming back to a comment from one of the previous reviews, this is also 
> called gt but is a different thing altogether. I would really like us to 
> afford ourselves a few more characters so it is just easier to read the 
> code.

Then start suggesting names. A tiny bit late to object to the gt pattern
at this point! :)

But we do need a better name for the graphics device "globals"

> > +static struct i915_timeline_cacheline *
> > +cacheline_alloc(struct i915_timeline_hwsp *hwsp, unsigned int cacheline)
> > +{
> > +     struct i915_timeline_cacheline *cl;
> > +
> > +     GEM_BUG_ON(cacheline >= 64);
> 
> Maybe pull out CACHELINES_PER_PAGE as HWSP_CACHELINES_PER_PAGE or something?

No, this is the size of the bitfield and I don't know of any preprocessor
tricks to determine that.

> > +     /*
> > +      * Attach the old cacheline to the current request, so that we only
> > +      * free it after the current request is retired, which ensures that
> > +      * all writes into the cacheline from previous requests are complete.
> > +      */
> > +     err = i915_active_ref(&tl->hwsp_cacheline->active,
> > +                           tl->fence_context, rq);
> 
> Right, this is the rq + 1 magic akin to unpinning the previous context. It 
> was confusing me for a bit why we would be assigning the old cacheline to 
> the current rq.

It just means we don't have to track it on every request in the local
timeline, which is a bit easier and a lot less work.

> > +     if (err)
> > +             goto err_cacheline;
> > +
> > +     tl->hwsp_ggtt = i915_vma_get(vma);
> > +     tl->hwsp_offset = cacheline * CACHELINE_BYTES;
> > +     __i915_vma_pin(tl->hwsp_ggtt);
> > +
> > +     cacheline_release(tl->hwsp_cacheline); /* ownership now xfered to rq */
> > +     cacheline_free(tl->hwsp_cacheline);
> > +
> > +     cacheline_acquire(cl);
> > +     tl->hwsp_cacheline = cl;
> > +
> > +     *seqno = timeline_advance(tl);
> > +     return 0;
> > +
> > +err_cacheline:
> 
> i915_vma_put looks to be missing here to fully unwind cacheline_alloc.
 
Now there's a cacheline_free() that can be used here.

> > +     kfree(cl);
> > +err_hwsp:
> > +     __idle_hwsp_free(vma->private, cacheline);
> > +err_rollback:
> > +     timeline_rollback(tl);
> > +     return err;
> > +}

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
  2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
                   ` (13 preceding siblings ...)
  2019-01-30  7:32 ` ✗ Fi.CI.IGT: failure " Patchwork
@ 2019-01-30 18:41 ` Patchwork
  14 siblings, 0 replies; 38+ messages in thread
From: Patchwork @ 2019-01-30 18:41 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3)
URL   : https://patchwork.freedesktop.org/series/55956/
State : failure

== Summary ==

Applying: drm/i915: Revoke mmaps and prevent access to fence registers across reset
Applying: drm/i915/execlists: Suppress redundant preemption
Applying: drm/i915/selftests: Exercise some AB...BA preemption chains
Applying: drm/i915: Generalise GPU activity tracking
Applying: drm/i915: Add timeline barrier support
Applying: drm/i915: Allocate active tracking nodes from a slabcache
Applying: drm/i915: Pull i915_gem_active into the i915_active family
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/i915_active.c
M	drivers/gpu/drm/i915/i915_active.h
M	drivers/gpu/drm/i915/i915_active_types.h
M	drivers/gpu/drm/i915/i915_gem.c
M	drivers/gpu/drm/i915/i915_gem_gtt.c
M	drivers/gpu/drm/i915/i915_vma.c
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/i915_vma.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/i915_vma.c
Auto-merging drivers/gpu/drm/i915/i915_gem_gtt.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/i915_gem_gtt.c
Auto-merging drivers/gpu/drm/i915/i915_gem.c
Auto-merging drivers/gpu/drm/i915/i915_active_types.h
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/i915_active_types.h
Auto-merging drivers/gpu/drm/i915/i915_active.h
Auto-merging drivers/gpu/drm/i915/i915_active.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/i915_active.c
error: Failed to merge in the changes.
hint: Use 'git am --show-current-patch' to see the failed patch
Patch failed at 0007 drm/i915: Pull i915_gem_active into the i915_active family
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache
  2019-01-30 16:08     ` Chris Wilson
@ 2019-01-30 21:58       ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-30 21:58 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2019-01-30 16:08:59)
> Quoting Tvrtko Ursulin (2019-01-30 15:57:06)
> > Under what scenarios do we end up not freeing active nodes sufficiently? It 
> > would have to be some user which keeps many contexts around, having only 
> > used them once?
> 
> Never in a typical scenario :) Almost every allocation is served from
> the last_request slot, you need to have a pair of concurrent references
> to a vma for it to activate, and that takes multiple engines reading the
> same object within a context/ppgtt.

That being said, that applies to vma. HW semaphores are an example where
the cacheline could be shared far and wide with uncertain lifetimes.
Context barriers, in contrast, just have a few different timelines and
idle almost immediately.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-30 18:14     ` Chris Wilson
@ 2019-01-31  9:19       ` Tvrtko Ursulin
  2019-01-31  9:30         ` Chris Wilson
  2019-01-31  9:36         ` Chris Wilson
  0 siblings, 2 replies; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-31  9:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 18:14, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-30 18:05:42)
>>
>> On 30/01/2019 02:19, Chris Wilson wrote:
>>> In the next patch, we add another user that wants to check whether
>>> requests can be merged into a single HW execution, and in the future we
>>> want to add more conditions under which requests from the same context
>>> cannot be merged. In preparation, extract out can_merge_rq().
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/intel_lrc.c | 30 +++++++++++++++++++-----------
>>>    1 file changed, 19 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>>> index 2616b0b3e8d5..e97ce54138d3 100644
>>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>>> @@ -285,12 +285,11 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
>>>    }
>>>    
>>>    __maybe_unused static inline bool
>>> -assert_priority_queue(const struct intel_engine_execlists *execlists,
>>> -                   const struct i915_request *prev,
>>> +assert_priority_queue(const struct i915_request *prev,
>>>                      const struct i915_request *next)
>>>    {
>>> -     if (!prev)
>>> -             return true;
>>> +     const struct intel_engine_execlists *execlists =
>>> +             &prev->engine->execlists;
>>>    
>>>        /*
>>>         * Without preemption, the prev may refer to the still active element
>>> @@ -601,6 +600,17 @@ static bool can_merge_ctx(const struct intel_context *prev,
>>>        return true;
>>>    }
>>>    
>>> +static bool can_merge_rq(const struct i915_request *prev,
>>> +                      const struct i915_request *next)
>>> +{
>>> +     GEM_BUG_ON(!assert_priority_queue(prev, next));
>>> +
>>> +     if (!can_merge_ctx(prev->hw_context, next->hw_context))
>>> +             return false;
>>> +
>>> +     return true;
>>
>> I'll assume you'll be adding here in the future as the reason this is
>> not simply "return can_merge_ctx(...)"?
> 
> Yes, raison d'etre of making the change.
> 
>>>    static void port_assign(struct execlist_port *port, struct i915_request *rq)
>>>    {
>>>        GEM_BUG_ON(rq == port_request(port));
>>> @@ -753,8 +763,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>>                int i;
>>>    
>>>                priolist_for_each_request_consume(rq, rn, p, i) {
>>> -                     GEM_BUG_ON(!assert_priority_queue(execlists, last, rq));
>>> -
>>>                        /*
>>>                         * Can we combine this request with the current port?
>>>                         * It has to be the same context/ringbuffer and not
>>> @@ -766,8 +774,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>>                         * second request, and so we never need to tell the
>>>                         * hardware about the first.
>>>                         */
>>> -                     if (last &&
>>> -                         !can_merge_ctx(rq->hw_context, last->hw_context)) {
>>> +                     if (last && !can_merge_rq(last, rq)) {
>>> +                             if (last->hw_context == rq->hw_context)
>>> +                                     goto done;
>>
>> I don't get this added check. AFAICS it will only trigger with GVT
>> making it not consider filling both ports if possible.
> 
> Because we are preparing for can_merge_rq() deciding not to merge the
> same context. If we do that we can't continue on to the next port and
> must terminate the loop, violating the trick with the hint in the
> process.
> 
> This changes due to the next patch, per-context freq and probably more
> that I've forgotten.

After a second look, I noticed the existing GVT comment a bit lower down 
which avoids populating port1 already.

Maybe one thing which would make sense is to re-arrange these checks in 
the order of "priority", like:

	if (last && !can_merge_rq(...)) {
		// naturally highest prio since it is impossible
		if (port == last_port)
			goto done;
		// 2nd highest to account for programming limitation
		else if (last->hw_context == rq->hw_context)
			goto done;
		// GVT check simplified (I think - since we know last is either
		// different ctx or single submit)
		else if (ctx_single_port_submission(rq->hw_context))
			goto done;
> 
>>> +
>>>                                /*
>>>                                 * If we are on the second port and cannot
>>>                                 * combine this request with the last, then we
>>> @@ -787,7 +797,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>>                                    ctx_single_port_submission(rq->hw_context))
>>>                                        goto done;
>>>    
>>> -                             GEM_BUG_ON(last->hw_context == rq->hw_context);
>>
>> This is related to the previous comment. Rebase error?
> 
> Previous if check, so it's clear at this point that we can't be using
> the same.

Yep.

> 
>>> @@ -827,8 +836,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>>>         * request triggering preemption on the next dequeue (or subsequent
>>>         * interrupt for secondary ports).
>>>         */
>>> -     execlists->queue_priority_hint =
>>> -             port != execlists->port ? rq_prio(last) : INT_MIN;
>>> +     execlists->queue_priority_hint = queue_prio(execlists);
>>
>> This shouldn't be in this patch.
> 
> If we terminate the loop early, we need to look at the head of the
> queue.

Why is it different from ending early for any other (existing) reason? 
Although I concede better management of queue_priority_hint is exactly 
what I was suggesting. Oops. The consequences are not entirely 
straightforward though: if we decide not to submit all of a single 
context, or leave port1 empty, currently we would hint scheduling the 
tasklet for any new submission; with this change, only after a CS or if 
a higher-priority ctx is submitted. Which is what makes me feel the 
behaviour change should be a separate patch (since a high prio, higher 
than INT_MIN, is potentially head of the queue).

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-31  9:19       ` Tvrtko Ursulin
@ 2019-01-31  9:30         ` Chris Wilson
  2019-01-31  9:36         ` Chris Wilson
  1 sibling, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-31  9:30 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-31 09:19:18)
> 
> On 30/01/2019 18:14, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-01-30 18:05:42)
> >>
> >> On 30/01/2019 02:19, Chris Wilson wrote:
> >>> @@ -827,8 +836,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >>>         * request triggering preemption on the next dequeue (or subsequent
> >>>         * interrupt for secondary ports).
> >>>         */
> >>> -     execlists->queue_priority_hint =
> >>> -             port != execlists->port ? rq_prio(last) : INT_MIN;
> >>> +     execlists->queue_priority_hint = queue_prio(execlists);
> >>
> >> This shouldn't be in this patch.
> > 
> > If we terminate the loop early, we need to look at the head of the
> > queue.
> 
> Why is it different from ending early for any other (existing) reason? 
> Although I concede better management of queue_priority_hint is exactly 
> what I was suggesting. Oops. The consequences are not entirely 
> straightforward though: if we decide not to submit all of a single 
> context, or leave port1 empty, currently we would hint scheduling the 
> tasklet for any new submission; with this change, only after a CS or if 
> a higher-priority ctx is submitted. Which is what makes me feel the 
> behaviour change should be a separate patch (since a high prio, higher 
> than INT_MIN, is potentially head of the queue).

Not quite. Previously if we saw port1 was empty it meant that last was
invalid and so the right choice was INT_MIN as the queue was empty. In
all other cases last is the first request in the priority list.

After this patch, we cannot draw the same conclusions from port1 being
empty, and nor can we directly inspect last. So to get the same result
as before the patch, we must actually look at the priority queue.
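
i.e. queue_prio() is just a peek at the topmost priolist in the rbtree,
something like this simplified sketch (ignoring the internal
priority-bump levels within a priolist):

	static int queue_prio(const struct intel_engine_execlists *execlists)
	{
		struct rb_node *rb = rb_first_cached(&execlists->queue);

		if (!rb) /* nothing queued, same as the old port-empty case */
			return INT_MIN;

		/* priority of the bucket the next dequeue would pick from */
		return to_priolist(rb)->priority;
	}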
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-31  9:19       ` Tvrtko Ursulin
  2019-01-31  9:30         ` Chris Wilson
@ 2019-01-31  9:36         ` Chris Wilson
  1 sibling, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-31  9:36 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-31 09:19:18)
> 
> On 30/01/2019 18:14, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-01-30 18:05:42)
> >>> @@ -766,8 +774,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
> >>>                         * second request, and so we never need to tell the
> >>>                         * hardware about the first.
> >>>                         */
> >>> -                     if (last &&
> >>> -                         !can_merge_ctx(rq->hw_context, last->hw_context)) {
> >>> +                     if (last && !can_merge_rq(last, rq)) {
> >>> +                             if (last->hw_context == rq->hw_context)
> >>> +                                     goto done;
> >>
> >> I don't get this added check. AFAICS it will only trigger with GVT
> >> making it not consider filling both ports if possible.
> > 
> > Because we are preparing for can_merge_rq() deciding not to merge the
> > same context. If we do that we can't continue on to the next port and
> > must terminate the loop, violating the trick with the hint in the
> > process.
> > 
> > This changes due to the next patch, per-context freq and probably more
> > that I've forgotten.
> 
> After a second look, I noticed the existing GVT comment a bit lower down 
> which avoids populating port1 already.
> 
> Maybe one thing which would make sense is to re-arrange these checks in 
> the order of "priority", like:
> 
>         if (last && !can_merge_rq(...)) {
>                 // naturally highest prio since it is impossible
>                 if (port == last_port)
>                         goto done;
>                 // 2nd highest to account for programming limitation
>                 else if (last->hw_context == rq->hw_context)
>                         goto done;

I was tempted to pull the last_port and context checks together.

>                 // GVT check simplified (I think - since we know last is either
>                 // different ctx or single submit)
>                 else if (ctx_single_port_submission(rq->hw_context))
>                         goto done;

And that's what I think too; I tried to get gvt to clarify whether their
checks are excessive. And I'll keep on suggesting that they remove their
poking around inside the scheduler... :-p

But it's definitely something I want out of sight.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-30  2:19 ` [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ Chris Wilson
@ 2019-01-31 13:19   ` Tvrtko Ursulin
  2019-01-31 13:39     ` Chris Wilson
  2019-01-31 17:21     ` Chris Wilson
  0 siblings, 2 replies; 38+ messages in thread
From: Tvrtko Ursulin @ 2019-01-31 13:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/01/2019 02:19, Chris Wilson wrote:
> Having introduced per-context seqno, we now have a means to identify
> progress across the system without fear of rollback as befell the
> global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> advance of submission safe in the knowledge that our target seqno and
> address is stable.
> 
> However, since we are telling the GPU to busy-spin on the target address
> until it matches the signaling seqno, we only want to do so when we are
> sure that busy-spin will be completed quickly. To achieve this we only
> submit the request to HW once the signaler is itself executing (modulo
> preemption causing us to wait longer), and we only do so for default and
> above priority requests (so that idle priority tasks never themselves
> hog the GPU waiting for others).

It could be milliseconds though. I think apart from media-bench saying 
this is faster, we would need to look at performance per Watt as well.

RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
idea how to tune it.

Eventually, do we dare adding this without a runtime switch? (There, I 
mentioned the taboo.)

What about signal mode and handling this via context switches?

> But what AB-BA deadlocks? If you remove B, there can be no deadlock...
> The issue is that with a deep ELSP queue, we can queue up a pair of
> AB-BA on different engines, thus forming a classic mutual exclusion
> deadlock. We side-step that issue by restricting the queue depth to
> avoid having multiple semaphores in flight and so we only ever take one
> set of locks at a time.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_request.c       | 153 +++++++++++++++++++++-
>   drivers/gpu/drm/i915/i915_request.h       |   1 +
>   drivers/gpu/drm/i915/i915_scheduler.c     |   1 +
>   drivers/gpu/drm/i915/i915_scheduler.h     |   1 +
>   drivers/gpu/drm/i915/i915_sw_fence.c      |   4 +-
>   drivers/gpu/drm/i915/i915_sw_fence.h      |   3 +
>   drivers/gpu/drm/i915/intel_gpu_commands.h |   5 +
>   drivers/gpu/drm/i915/intel_lrc.c          |  14 +-
>   8 files changed, 178 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 07e4c3c68ecd..6d825cd28ae6 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -22,8 +22,9 @@
>    *
>    */
>   
> -#include <linux/prefetch.h>
>   #include <linux/dma-fence-array.h>
> +#include <linux/irq_work.h>
> +#include <linux/prefetch.h>
>   #include <linux/sched.h>
>   #include <linux/sched/clock.h>
>   #include <linux/sched/signal.h>
> @@ -326,6 +327,76 @@ void i915_request_retire_upto(struct i915_request *rq)
>   	} while (tmp != rq);
>   }
>   
> +struct execute_cb {
> +	struct list_head link;
> +	struct irq_work work;
> +	struct i915_sw_fence *fence;
> +};
> +
> +static void irq_execute_cb(struct irq_work *wrk)
> +{
> +	struct execute_cb *cb = container_of(wrk, typeof(*cb), work);
> +
> +	i915_sw_fence_complete(cb->fence);
> +	kfree(cb);
> +}
> +
> +static void __notify_execute_cb(struct i915_request *rq)
> +{
> +	struct execute_cb *cb;
> +
> +	lockdep_assert_held(&rq->lock);
> +
> +	if (list_empty(&rq->execute_cb))
> +		return;
> +
> +	list_for_each_entry(cb, &rq->execute_cb, link)
> +		irq_work_queue(&cb->work);
> +
> +	/*
> +	 * XXX Rollback on __i915_request_unsubmit()
> +	 *
> +	 * In the future, perhaps when we have an active time-slicing scheduler,
> +	 * it will be interesting to unsubmit parallel execution and remove
> +	 * busywaits from the GPU until their master is restarted. This is
> +	 * quite hairy, we have to carefully rollback the fence and do a
> +	 * preempt-to-idle cycle on the target engine, all the while the
> +	 * master execute_cb may refire.
> +	 */
> +	INIT_LIST_HEAD(&rq->execute_cb);
> +}
> +
> +static int
> +i915_request_await_execution(struct i915_request *rq,
> +			     struct i915_request *signal,
> +			     gfp_t gfp)
> +{
> +	struct execute_cb *cb;
> +	unsigned long flags;
> +
> +	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags))
> +		return 0;
> +
> +	cb = kmalloc(sizeof(*cb), gfp);
> +	if (!cb)
> +		return -ENOMEM;
> +
> +	cb->fence = &rq->submit;
> +	i915_sw_fence_await(cb->fence);
> +	init_irq_work(&cb->work, irq_execute_cb);
> +
> +	spin_lock_irqsave(&signal->lock, flags);
> +	if (test_bit(I915_FENCE_FLAG_ACTIVE, &signal->fence.flags)) {
> +		i915_sw_fence_complete(cb->fence);
> +		kfree(cb);
> +	} else {
> +		list_add_tail(&cb->link, &signal->execute_cb);
> +	}
> +	spin_unlock_irqrestore(&signal->lock, flags);
> +
> +	return 0;
> +}
> +
>   static void move_to_timeline(struct i915_request *request,
>   			     struct i915_timeline *timeline)
>   {
> @@ -373,6 +444,7 @@ void __i915_request_submit(struct i915_request *request)
>   	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
>   	    !i915_request_enable_breadcrumb(request))
>   		intel_engine_queue_breadcrumbs(engine);
> +	__notify_execute_cb(request);
>   	spin_unlock(&request->lock);
>   
>   	engine->emit_fini_breadcrumb(request,
> @@ -613,6 +685,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	}
>   
>   	INIT_LIST_HEAD(&rq->active_list);
> +	INIT_LIST_HEAD(&rq->execute_cb);
>   
>   	tl = ce->ring->timeline;
>   	ret = i915_timeline_get_seqno(tl, rq, &seqno);
> @@ -700,6 +773,81 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
>   	return ERR_PTR(ret);
>   }
>   
> +static int
> +emit_semaphore_wait(struct i915_request *to,
> +		    struct i915_request *from,
> +		    gfp_t gfp)
> +{
> +	u32 *cs;
> +	int err;
> +
> +	GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> +
> +	err = i915_timeline_read_lock(from->timeline, to);
> +	if (err)
> +		return err;
> +
> +	/*
> +	 * If we know our signaling request has started, we know that it
> +	 * must, at least, have passed its initial breadcrumb and that its
> +	 * seqno can only increase, therefore any change in its breadcrumb
> +	 * must indicate completion. By using a "not equal to start" compare
> +	 * we avoid the murky issue of how to handle seqno wraparound in an
> +	 * async environment (short answer, we must stop the world whenever
> +	 * any context wraps!) as the likelihood of missing one request then
> +	 * seeing the same start value for a new request is 1 in 2^31, and
> +	 * even then we know that the new request has started and is in
> +	 * progress, so we are sure it will complete soon enough (not to
> +	 * worry about).
> +	 */
> +	if (i915_request_started(from)) {
> +		cs = intel_ring_begin(to, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_SEMAPHORE_WAIT |
> +			MI_SEMAPHORE_GLOBAL_GTT |
> +			MI_SEMAPHORE_POLL |
> +			MI_SEMAPHORE_SAD_NEQ_SDD;
> +		*cs++ = from->fence.seqno - 1;
> +		*cs++ = from->timeline->hwsp_offset;
> +		*cs++ = 0;
> +
> +		intel_ring_advance(to, cs);
> +	} else {
> +		int err;
> +
> +		err = i915_request_await_execution(to, from, gfp);
> +		if (err)
> +			return err;
> +
> +		cs = intel_ring_begin(to, 4);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		/*
> +		 * Using greater-than-or-equal here means we have to worry
> +		 * about seqno wraparound. To side step that issue, we swap
> +		 * the timeline HWSP upon wrapping, so that everyone listening
> +		 * for the old (pre-wrap) values do not see the much smaller
> +		 * (post-wrap) values than they were expecting (and so wait
> +		 * forever).
> +		 */
> +		*cs++ = MI_SEMAPHORE_WAIT |
> +			MI_SEMAPHORE_GLOBAL_GTT |
> +			MI_SEMAPHORE_POLL |
> +			MI_SEMAPHORE_SAD_GTE_SDD;
> +		*cs++ = from->fence.seqno;
> +		*cs++ = from->timeline->hwsp_offset;
> +		*cs++ = 0;
> +
> +		intel_ring_advance(to, cs);
> +	}

Would it not work to have a single path which emits the wait on NEQ 
from->fence.seqno - 1, just i915_request_await_execution conditional on 
i915_request_started?

In the !started case, having added the await, we would know the 
semaphore wait would not run until after the dependency has started, and 
NEQ would be true when it completes. The same as the above started path.
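
Roughly what I have in mind, as an untested sketch (eliding the timeline 
read-lock and the sched.semaphore bookkeeping from the patch):

	static int
	emit_semaphore_wait(struct i915_request *to,
			    struct i915_request *from,
			    gfp_t gfp)
	{
		u32 *cs;
		int err;

		/* Only order against execution if the signaler has not started. */
		if (!i915_request_started(from)) {
			err = i915_request_await_execution(to, from, gfp);
			if (err)
				return err;
		}

		cs = intel_ring_begin(to, 4);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/* By the time this runs, from has started, so NEQ(start) means done. */
		*cs++ = MI_SEMAPHORE_WAIT |
			MI_SEMAPHORE_GLOBAL_GTT |
			MI_SEMAPHORE_POLL |
			MI_SEMAPHORE_SAD_NEQ_SDD;
		*cs++ = from->fence.seqno - 1;
		*cs++ = from->timeline->hwsp_offset;
		*cs++ = 0;

		intel_ring_advance(to, cs);
		return 0;
	}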

> +
> +	to->sched.semaphore = true;
> +	return 0;
> +}
> +
>   static int
>   i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   {
> @@ -723,6 +871,9 @@ i915_request_await_request(struct i915_request *to, struct i915_request *from)
>   		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
>   						       &from->submit,
>   						       I915_FENCE_GFP);
> +	} else if (HAS_EXECLISTS(to->i915) &&
> +		   to->gem_context->sched.priority >= I915_PRIORITY_NORMAL) {
> +		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
>   	} else {
>   		ret = i915_sw_fence_await_dma_fence(&to->submit,
>   						    &from->fence, 0,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 40f3e8dcbdd5..66a374ee177a 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -127,6 +127,7 @@ struct i915_request {
>   	 */
>   	struct i915_sw_fence submit;
>   	wait_queue_entry_t submitq;
> +	struct list_head execute_cb;
>   
>   	/*
>   	 * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
> index d01683167c77..aa6c663dca09 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.c
> +++ b/drivers/gpu/drm/i915/i915_scheduler.c
> @@ -29,6 +29,7 @@ void i915_sched_node_init(struct i915_sched_node *node)
>   	INIT_LIST_HEAD(&node->waiters_list);
>   	INIT_LIST_HEAD(&node->link);
>   	node->attr.priority = I915_PRIORITY_INVALID;
> +	node->semaphore = false;
>   }
>   
>   static struct i915_dependency *
> diff --git a/drivers/gpu/drm/i915/i915_scheduler.h b/drivers/gpu/drm/i915/i915_scheduler.h
> index dbe9cb7ecd82..d764cf10536f 100644
> --- a/drivers/gpu/drm/i915/i915_scheduler.h
> +++ b/drivers/gpu/drm/i915/i915_scheduler.h
> @@ -72,6 +72,7 @@ struct i915_sched_node {
>   	struct list_head waiters_list; /* those after us, they depend upon us */
>   	struct list_head link;
>   	struct i915_sched_attr attr;
> +	bool semaphore;
>   };
>   
>   struct i915_dependency {
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 7c58b049ecb5..8d1400d378d7 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -192,7 +192,7 @@ static void __i915_sw_fence_complete(struct i915_sw_fence *fence,
>   	__i915_sw_fence_notify(fence, FENCE_FREE);
>   }
>   
> -static void i915_sw_fence_complete(struct i915_sw_fence *fence)
> +void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   
> @@ -202,7 +202,7 @@ static void i915_sw_fence_complete(struct i915_sw_fence *fence)
>   	__i915_sw_fence_complete(fence, NULL);
>   }
>   
> -static void i915_sw_fence_await(struct i915_sw_fence *fence)
> +void i915_sw_fence_await(struct i915_sw_fence *fence)
>   {
>   	debug_fence_assert(fence);
>   	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 0e055ea0179f..6dec9e1d1102 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -79,6 +79,9 @@ int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    unsigned long timeout,
>   				    gfp_t gfp);
>   
> +void i915_sw_fence_await(struct i915_sw_fence *fence);
> +void i915_sw_fence_complete(struct i915_sw_fence *fence);
> +
>   static inline bool i915_sw_fence_signaled(const struct i915_sw_fence *fence)
>   {
>   	return atomic_read(&fence->pending) <= 0;
> diff --git a/drivers/gpu/drm/i915/intel_gpu_commands.h b/drivers/gpu/drm/i915/intel_gpu_commands.h
> index b96a31bc1080..0efaadd3bc32 100644
> --- a/drivers/gpu/drm/i915/intel_gpu_commands.h
> +++ b/drivers/gpu/drm/i915/intel_gpu_commands.h
> @@ -106,7 +106,12 @@
>   #define   MI_SEMAPHORE_TARGET(engine)	((engine)<<15)
>   #define MI_SEMAPHORE_WAIT	MI_INSTR(0x1c, 2) /* GEN8+ */
>   #define   MI_SEMAPHORE_POLL		(1<<15)
> +#define   MI_SEMAPHORE_SAD_GT_SDD	(0<<12)
>   #define   MI_SEMAPHORE_SAD_GTE_SDD	(1<<12)
> +#define   MI_SEMAPHORE_SAD_LT_SDD	(2<<12)
> +#define   MI_SEMAPHORE_SAD_LTE_SDD	(3<<12)
> +#define   MI_SEMAPHORE_SAD_EQ_SDD	(4<<12)
> +#define   MI_SEMAPHORE_SAD_NEQ_SDD	(5<<12)
>   #define MI_STORE_DWORD_IMM	MI_INSTR(0x20, 1)
>   #define MI_STORE_DWORD_IMM_GEN4	MI_INSTR(0x20, 2)
>   #define   MI_MEM_VIRTUAL	(1 << 22) /* 945,g33,965 */
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index e97ce54138d3..80d17b75b2e6 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -421,7 +421,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
>   	 * in the priority queue, but they will not gain immediate access to
>   	 * the GPU.
>   	 */
> -	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY) {
> +	if ((prio & ACTIVE_PRIORITY) != ACTIVE_PRIORITY &&
> +	    i915_request_started(active)) {
>   		prio |= ACTIVE_PRIORITY;
>   		active->sched.attr.priority = prio;
>   		list_move_tail(&active->sched.link,
> @@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
>   {
>   	GEM_BUG_ON(!assert_priority_queue(prev, next));
>   
> +	/*
> +	 * To avoid AB-BA deadlocks, we simply restrict ourselves to only
> +	 * submitting one semaphore (think HW spinlock) to HW at a time. This
> +	 * prevents the execution callback on a later sempahore from being
> +	 * queued on another engine, so no cycle can be formed. Preemption
> +	 * rules should mean that if this semaphore is preempted, its
> +	 * dependency chain is preserved and suitably promoted via PI.
> +	 */
> +	if (prev->sched.semaphore && !i915_request_started(prev))
> +		return false;
> +
>   	if (!can_merge_ctx(prev->hw_context, next->hw_context))
>   		return false;
>   
> 

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-31 13:19   ` Tvrtko Ursulin
@ 2019-01-31 13:39     ` Chris Wilson
  2019-01-31 16:20       ` Chris Wilson
  2019-01-31 17:21     ` Chris Wilson
  1 sibling, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-31 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Having introduced per-context seqno, we now have a means to identify
> > progress across the system without fear of rollback as befell the
> > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > advance of submission safe in the knowledge that our target seqno and
> > address is stable.
> > 
> > However, since we are telling the GPU to busy-spin on the target address
> > until it matches the signaling seqno, we only want to do so when we are
> > sure that busy-spin will be completed quickly. To achieve this we only
> > submit the request to HW once the signaler is itself executing (modulo
> > preemption causing us to wait longer), and we only do so for default and
> > above priority requests (so that idle priority tasks never themselves
> > hog the GPU waiting for others).
> 
> It could be milliseconds though. I think apart from media-bench saying 
> this is faster, we would need to look at performance per Watt as well.

All throughput measurements are substantially faster, as you would
expect, and inter-engine latency decreased. I would hope it would
powergate/rc6 the EU while the CS was spinning, but I don't know :)
 
> RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
> idea how to tune it.
> 
> Eventually, do we dare adding this without a runtime switch? (There, I 
> mentioned the taboo.)

Yes :p

> What about signal mode and handling this via context switches?

That's 99% of the timeslicing scheduler right there -- the handling of
deferred work with the complication of it impacting other engines.
 
> > +static int
> > +emit_semaphore_wait(struct i915_request *to,
> > +                 struct i915_request *from,
> > +                 gfp_t gfp)
> > +{
> > +     u32 *cs;
> > +     int err;
> > +
> > +     GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
> > +
> > +     err = i915_timeline_read_lock(from->timeline, to);
> > +     if (err)
> > +             return err;
> > +
> > +     /*
> > +      * If we know our signaling request has started, we know that it
> > +      * must, at least, have passed its initial breadcrumb and that its
> > +      * seqno can only increase, therefore any change in its breadcrumb
> > +      * must indicate completion. By using a "not equal to start" compare
> > +      * we avoid the murky issue of how to handle seqno wraparound in an
> > +      * async environment (short answer, we must stop the world whenever
> > +      * any context wraps!) as the likelihood of missing one request then
> > +      * seeing the same start value for a new request is 1 in 2^31, and
> > +      * even then we know that the new request has started and is in
> > +      * progress, so we are sure it will complete soon enough (not to
> > +      * worry about).
> > +      */
> > +     if (i915_request_started(from)) {
> > +             cs = intel_ring_begin(to, 4);
> > +             if (IS_ERR(cs))
> > +                     return PTR_ERR(cs);
> > +
> > +             *cs++ = MI_SEMAPHORE_WAIT |
> > +                     MI_SEMAPHORE_GLOBAL_GTT |
> > +                     MI_SEMAPHORE_POLL |
> > +                     MI_SEMAPHORE_SAD_NEQ_SDD;
> > +             *cs++ = from->fence.seqno - 1;
> > +             *cs++ = from->timeline->hwsp_offset;
> > +             *cs++ = 0;
> > +
> > +             intel_ring_advance(to, cs);
> > +     } else {
> > +             int err;
> > +
> > +             err = i915_request_await_execution(to, from, gfp);
> > +             if (err)
> > +                     return err;
> > +
> > +             cs = intel_ring_begin(to, 4);
> > +             if (IS_ERR(cs))
> > +                     return PTR_ERR(cs);
> > +
> > +             /*
> > +              * Using greater-than-or-equal here means we have to worry
> > +              * about seqno wraparound. To side step that issue, we swap
> > +              * the timeline HWSP upon wrapping, so that everyone listening
> > +              * for the old (pre-wrap) values do not see the much smaller
> > +              * (post-wrap) values than they were expecting (and so wait
> > +              * forever).
> > +              */
> > +             *cs++ = MI_SEMAPHORE_WAIT |
> > +                     MI_SEMAPHORE_GLOBAL_GTT |
> > +                     MI_SEMAPHORE_POLL |
> > +                     MI_SEMAPHORE_SAD_GTE_SDD;
> > +             *cs++ = from->fence.seqno;
> > +             *cs++ = from->timeline->hwsp_offset;
> > +             *cs++ = 0;
> > +
> > +             intel_ring_advance(to, cs);
> > +     }
> 
> Would it not work to have a single path which emits the wait on NEQ 
> from->fence.seqno - 1, just i915_request_await_execution conditional on 
> i915_request_started?
> 
> In the !started case, having added the await, we would know the 
> semaphore wait would not run until after the dependency has started, and 
> NEQ would be true when it completes. The same as the above started path.

We may have previously submitted the signaler in a very long queue to
its engine so cannot determine its position, in which case we could
sample a long time before it even begins. Even if we launch both
requests on the different engines at the same time, we could sample
before the started semaphore.

I should remove the current NEQ path, it was before I committed myself
to handling the HWSP across wraparounds, and is now just needless
complication.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-31 13:39     ` Chris Wilson
@ 2019-01-31 16:20       ` Chris Wilson
  2019-02-01  9:03         ` Chris Wilson
  0 siblings, 1 reply; 38+ messages in thread
From: Chris Wilson @ 2019-01-31 16:20 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2019-01-31 13:39:50)
> Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> > 
> > On 30/01/2019 02:19, Chris Wilson wrote:
> > > Having introduced per-context seqno, we now have a means to identify
> > > progress across the system without fear of rollback as befell the
> > > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > > advance of submission safe in the knowledge that our target seqno and
> > > address is stable.
> > > 
> > > However, since we are telling the GPU to busy-spin on the target address
> > > until it matches the signaling seqno, we only want to do so when we are
> > > sure that busy-spin will be completed quickly. To achieve this we only
> > > submit the request to HW once the signaler is itself executing (modulo
> > > preemption causing us to wait longer), and we only do so for default and
> > > above priority requests (so that idle priority tasks never themselves
> > > hog the GPU waiting for others).
> > 
> > It could be milliseconds though. I think apart from media-bench saying 
> > this is faster, we would need to look at performance per Watt as well.
> 
> All throughput measurements are substantially faster, as you would
> expect, and inter-engine latency decreased. I would hope it would
> powergate/rc6 the EU while the CS was spinning, but I don't know :)

Fwiw, it's about the power cost of simply spinning with the CS without
any additional cost of utilizing the engine.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-31 13:19   ` Tvrtko Ursulin
  2019-01-31 13:39     ` Chris Wilson
@ 2019-01-31 17:21     ` Chris Wilson
  1 sibling, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-01-31 17:21 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> 
> On 30/01/2019 02:19, Chris Wilson wrote:
> > Having introduced per-context seqno, we now have a means to identify
> > progress across the system without fear of rollback as befell the
> > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > advance of submission safe in the knowledge that our target seqno and
> > address is stable.
> > 
> > However, since we are telling the GPU to busy-spin on the target address
> > until it matches the signaling seqno, we only want to do so when we are
> > sure that busy-spin will be completed quickly. To achieve this we only
> > submit the request to HW once the signaler is itself executing (modulo
> > preemption causing us to wait longer), and we only do so for default and
> > above priority requests (so that idle priority tasks never themselves
> > hog the GPU waiting for others).
> 
> It could be milliseconds though. I think apart from media-bench saying 
> this is faster, we would need to look at performance per Watt as well.
> 
> RING_SEMA_WAIT_POLL is a potential tunable as well. Not that I have an 
> idea how to tune it.
> 
> Eventually, do we dare adding this without a runtime switch? (There, I 
> mentioned the taboo.)

Yes, we could make it a context setparam. I used priority here, as
arguing that idle workloads don't want the extra power draw makes sense.

Downside of making it opt-in, nobody benefits. Still it's pretty limited
to media workloads at the moment (who else uses multiple rings atm), but
even there reducing latency for desktop video is justifiable imo.

(Now having said that, I should go out and find a video player to
benchmark... Maybe we can demonstrate reduced frame drop for Kodi. If I
say "Kodi, Kodi, Kodi" I summon a Kodi dev right?)

Downside of making it opt-out: everybody gets to experience our bugs,
and the onus is on us in making the right choice.

> > @@ -605,6 +606,17 @@ static bool can_merge_rq(const struct i915_request *prev,
> >   {
> >       GEM_BUG_ON(!assert_priority_queue(prev, next));
> >   
> > +     /*
> > +      * To avoid AB-BA deadlocks, we simply restrict ourselves to only
> > +      * submitting one semaphore (think HW spinlock) to HW at a time. This
> > +      * prevents the execution callback on a later sempahore from being
> > +      * queued on another engine, so no cycle can be formed. Preemption
> > +      * rules should mean that if this semaphore is preempted, its
> > +      * dependency chain is preserved and suitably promoted via PI.
> > +      */
> > +     if (prev->sched.semaphore && !i915_request_started(prev))
> > +             return false;

The other way I was thinking we could solve this is to move the
execute_cb from i915_request_submit until we actually insert the request
in ELSP[0] (or do the promotion from ELSP[1]).

I don't like either much. I don't really want to walk the list of
requests for port0 checking for execute_cb, but I also don't like
arbitrarily splitting contexts (however, there seem to be reasons to do
that anyway).

It all depends on how fast we can service CS interrupts, and that needs
to always be fast. :|
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

* Re: [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+
  2019-01-31 16:20       ` Chris Wilson
@ 2019-02-01  9:03         ` Chris Wilson
  0 siblings, 0 replies; 38+ messages in thread
From: Chris Wilson @ 2019-02-01  9:03 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2019-01-31 16:20:37)
> Quoting Chris Wilson (2019-01-31 13:39:50)
> > Quoting Tvrtko Ursulin (2019-01-31 13:19:31)
> > > 
> > > On 30/01/2019 02:19, Chris Wilson wrote:
> > > > Having introduced per-context seqno, we now have a means to identify
> > > > progress across the system without fear of rollback as befell the
> > > > global_seqno. That is we can program a MI_SEMAPHORE_WAIT operation in
> > > > advance of submission safe in the knowledge that our target seqno and
> > > > address is stable.
> > > > 
> > > > However, since we are telling the GPU to busy-spin on the target address
> > > > until it matches the signaling seqno, we only want to do so when we are
> > > > sure that busy-spin will be completed quickly. To achieve this we only
> > > > submit the request to HW once the signaler is itself executing (modulo
> > > > preemption causing us to wait longer), and we only do so for default and
> > > > above priority requests (so that idle priority tasks never themselves
> > > > hog the GPU waiting for others).
> > > 
> > > It could be milliseconds though. I think apart from media-bench saying 
> > > this is faster, we would need to look at performance per Watt as well.
> > 
> > All throughput measurements are substantially faster, as you would
> > expect, and inter-engine latency decreased. I would hope it would
> > powergate/rc6 the EU while the CS was spinning, but I don't know :)
> 
> Fwiw, it's about the power cost of simply spinning with the CS without
> any additional cost of utilizing the engine.

Another interesting data point is that the *total* energy consumed for a
latency-bound test that passes a piece of work from one engine to the
next was reduced by 30%, with a speed increase of nearly 100% (on glk).
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 38+ messages in thread

end of thread, other threads:[~2019-02-01  9:03 UTC | newest]

Thread overview: 38+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-30  2:18 [PATCH 01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Chris Wilson
2019-01-30  2:18 ` [PATCH 02/11] drm/i915/execlists: Suppress redundant preemption Chris Wilson
2019-01-30  2:18 ` [PATCH 03/11] drm/i915/selftests: Exercise some AB...BA preemption chains Chris Wilson
2019-01-30  2:18 ` [PATCH 04/11] drm/i915: Generalise GPU activity tracking Chris Wilson
2019-01-30 12:02   ` Tvrtko Ursulin
2019-01-30 12:14     ` Chris Wilson
2019-01-30  2:19 ` [PATCH 05/11] drm/i915: Add timeline barrier support Chris Wilson
2019-01-30  2:19 ` [PATCH 06/11] drm/i915: Allocate active tracking nodes from a slabcache Chris Wilson
2019-01-30 15:57   ` Tvrtko Ursulin
2019-01-30 16:08     ` Chris Wilson
2019-01-30 21:58       ` Chris Wilson
2019-01-30 16:37   ` [PATCH v2] " Chris Wilson
2019-01-30 16:47     ` Chris Wilson
2019-01-30 17:25   ` [PATCH v3] " Chris Wilson
2019-01-30  2:19 ` [PATCH 07/11] drm/i915: Pull i915_gem_active into the i915_active family Chris Wilson
2019-01-30 16:11   ` Tvrtko Ursulin
2019-01-30  2:19 ` [PATCH 08/11] drm/i915: Keep timeline HWSP allocated until the system is idle Chris Wilson
2019-01-30 17:54   ` Tvrtko Ursulin
2019-01-30 18:25     ` Chris Wilson
2019-01-30  2:19 ` [PATCH 09/11] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
2019-01-30 18:05   ` Tvrtko Ursulin
2019-01-30 18:14     ` Chris Wilson
2019-01-31  9:19       ` Tvrtko Ursulin
2019-01-31  9:30         ` Chris Wilson
2019-01-31  9:36         ` Chris Wilson
2019-01-30  2:19 ` [PATCH 10/11] drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+ Chris Wilson
2019-01-31 13:19   ` Tvrtko Ursulin
2019-01-31 13:39     ` Chris Wilson
2019-01-31 16:20       ` Chris Wilson
2019-02-01  9:03         ` Chris Wilson
2019-01-31 17:21     ` Chris Wilson
2019-01-30  2:19 ` [PATCH 11/11] drm/i915: Prioritise non-busywait semaphore workloads Chris Wilson
2019-01-30  2:43 ` ✗ Fi.CI.CHECKPATCH: warning for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset Patchwork
2019-01-30  2:48 ` ✗ Fi.CI.SPARSE: " Patchwork
2019-01-30  3:07 ` ✓ Fi.CI.BAT: success " Patchwork
2019-01-30  7:32 ` ✗ Fi.CI.IGT: failure " Patchwork
2019-01-30  9:17   ` Chris Wilson
2019-01-30 18:41 ` ✗ Fi.CI.BAT: failure for series starting with [01/11] drm/i915: Revoke mmaps and prevent access to fence registers across reset (rev3) Patchwork
