* Keeping Tvrtko busy
From: Chris Wilson @ 2019-01-18 14:00 UTC
  To: intel-gfx

A continuation of the series under review, showing the current path to
load balancing. After the HWSP timeline rework, we have the removal of
the global seqno, and then the patches to construct a virtual engine
with load balancing across the equivalent siblings. Wrt veng, the
changes since last time all centre around improving the uapi to remove
more implicit choices baked into the kernel, plus some bugfixes that
came and went thanks to per-context seqno.
-Chris


* [PATCH 01/38] drm/i915/execlists: Store the highest priority context
From: Chris Wilson @ 2019-01-18 14:00 UTC
  To: intel-gfx

In order to avoid preempting ourselves, we currently refuse to schedule
the tasklet if we reschedule an inflight context. However, this glosses
over a few issues, such as what happens after a CS completion event
when we then preempt the newly executing context with itself, or when
something else causes a tasklet_schedule that triggers the same
evaluation and again preempts the active context with itself.

To avoid the extra complications, record the highest priority context
along with its priority value so that we can elide no-effect preemption
requests.
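
For illustration, the dequeue-side test then reduces to roughly the
following sketch (simplified pseudo-C rather than the exact driver code;
the real need_preempt() is in the patch below):

	if (intel_engine_has_preemption(engine) &&
	    execlists->queue_priority > rq_prio(last) &&
	    execlists->queue_context != last->hw_context && /* not ourselves */
	    !i915_request_completed(last))
		inject_preempt_context(engine);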

References: a2bf92e8cc16 ("drm/i915/execlists: Avoid kicking priority on the current context")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_scheduler.c   | 21 +++++++++++++++----
 drivers/gpu/drm/i915/intel_lrc.c        | 28 ++++++++++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  8 +++++++
 3 files changed, 46 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 340faea6c08a..e0b177687bec 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -239,6 +239,18 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
 	return engine;
 }
 
+static bool inflight(const struct i915_request *rq,
+		     const struct intel_engine_cs *engine)
+{
+	const struct i915_request *active;
+
+	if (!rq->global_seqno)
+		return false;
+
+	active = port_request(engine->execlists.port);
+	return active->hw_context == rq->hw_context;
+}
+
 static void __i915_schedule(struct i915_request *rq,
 			    const struct i915_sched_attr *attr)
 {
@@ -356,17 +368,18 @@ static void __i915_schedule(struct i915_request *rq,
 		if (prio <= engine->execlists.queue_priority)
 			continue;
 
+		engine->execlists.queue_priority = prio;
+		engine->execlists.queue_context =
+			node_to_request(node)->hw_context;
+
 		/*
 		 * If we are already the currently executing context, don't
 		 * bother evaluating if we should preempt ourselves.
 		 */
-		if (node_to_request(node)->global_seqno &&
-		    i915_seqno_passed(port_request(engine->execlists.port)->global_seqno,
-				      node_to_request(node)->global_seqno))
+		if (inflight(node_to_request(node), engine))
 			continue;
 
 		/* Defer (tasklet) submission until after all of our updates. */
-		engine->execlists.queue_priority = prio;
 		tasklet_hi_schedule(&engine->execlists.tasklet);
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index c0a42afaf177..379a43ed2f90 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -183,10 +183,12 @@ static inline int rq_prio(const struct i915_request *rq)
 
 static inline bool need_preempt(const struct intel_engine_cs *engine,
 				const struct i915_request *last,
-				int prio)
+				int prio,
+				const struct intel_context *context)
 {
 	return (intel_engine_has_preemption(engine) &&
 		__execlists_need_preempt(prio, rq_prio(last)) &&
+		last->hw_context != context &&
 		!i915_request_completed(last));
 }
 
@@ -579,7 +581,10 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			return;
 
-		if (need_preempt(engine, last, execlists->queue_priority)) {
+		if (need_preempt(engine, last,
+				 execlists->queue_priority,
+				 execlists->queue_context)) {
+			GEM_BUG_ON(execlists->queue_priority == INT_MIN);
 			inject_preempt_context(engine);
 			return;
 		}
@@ -696,6 +701,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * user, see queue_request(), the queue_priority is bumped to that
 	 * request triggering preemption on the next dequeue (or subsequent
 	 * interrupt for secondary ports).
+	 *
+	 * As we are guaranteed that the queue_priority does not preempt
+	 * the currently executing context, we can forgo resetting
+	 * queue_context here in the knowledge that it will be set before
+	 * any preemption.
 	 */
 	execlists->queue_priority =
 		port != execlists->port ? rq_prio(last) : INT_MIN;
@@ -1073,10 +1083,12 @@ static void __submit_queue_imm(struct intel_engine_cs *engine)
 		tasklet_hi_schedule(&execlists->tasklet);
 }
 
-static void submit_queue(struct intel_engine_cs *engine, int prio)
+static void submit_queue(struct intel_engine_cs *engine,
+			 const struct i915_request *rq)
 {
-	if (prio > engine->execlists.queue_priority) {
-		engine->execlists.queue_priority = prio;
+	if (rq_prio(rq) > engine->execlists.queue_priority) {
+		engine->execlists.queue_priority = rq_prio(rq);
+		engine->execlists.queue_context = rq->hw_context;
 		__submit_queue_imm(engine);
 	}
 }
@@ -1094,7 +1106,7 @@ static void execlists_submit_request(struct i915_request *request)
 	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 	GEM_BUG_ON(list_empty(&request->sched.link));
 
-	submit_queue(engine, rq_prio(request));
+	submit_queue(engine, request);
 
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
@@ -2736,7 +2748,9 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 
 	last = NULL;
 	count = 0;
-	drm_printf(m, "\t\tQueue priority: %d\n", execlists->queue_priority);
+	if (execlists->queue_priority != INT_MIN)
+		drm_printf(m, "\t\tQueue priority: %d\n",
+			   execlists->queue_priority);
 	for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
 		struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
 		int i;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c3ef0f9bf321..c5975c67f74d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -308,6 +308,14 @@ struct intel_engine_execlists {
 	 */
 	int queue_priority;
 
+	/**
+	 * @queue_context: Highest pending context
+	 *
+	 * Record the context that is at the forefront of the queue to
+	 * avoid preempting itself.
+	 */
+	const struct intel_context *queue_context;
+
 	/**
 	 * @queue: queue of requests, in priority lists
 	 */
-- 
2.20.1


* [PATCH 02/38] drm/i915: Make all GPU resets atomic
From: Chris Wilson @ 2019-01-18 14:00 UTC
  To: intel-gfx

In preparation for the next few commits, make resetting the GPU atomic.
We have already prepared gen6+ for atomically resetting individual
engines, but now there is a requirement to perform the whole
device-level reset (just the register poking) from inside an atomic
context.
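
In practice the change boils down to polling with udelay() instead of
sleeping. A minimal sketch of such an atomic-safe poll (illustrative
only, not the driver's wait_for_atomic() macro; the 50ms budget mirrors
the timeouts used below):

	/* Busy-wait on the condition; sleeping is not allowed here. */
	unsigned int timeout_us = 50 * 1000;
	int err = -ETIMEDOUT;

	while (timeout_us--) {
		if (i915_in_reset(pdev)) {
			err = 0;
			break;
		}
		udelay(1);
	}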

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c | 50 +++++++++++++++++--------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 342d9ee42601..b9d0ea70361c 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -144,14 +144,14 @@ static int i915_do_reset(struct drm_i915_private *i915,
 
 	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	usleep_range(50, 200);
-	err = wait_for(i915_in_reset(pdev), 500);
+	udelay(50);
+	err = wait_for_atomic(i915_in_reset(pdev), 50);
 
 	/* Clear the reset request. */
 	pci_write_config_byte(pdev, I915_GDRST, 0);
-	usleep_range(50, 200);
+	udelay(50);
 	if (!err)
-		err = wait_for(!i915_in_reset(pdev), 500);
+		err = wait_for_atomic(!i915_in_reset(pdev), 50);
 
 	return err;
 }
@@ -171,7 +171,7 @@ static int g33_do_reset(struct drm_i915_private *i915,
 	struct pci_dev *pdev = i915->drm.pdev;
 
 	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
-	return wait_for(g4x_reset_complete(pdev), 500);
+	return wait_for_atomic(g4x_reset_complete(pdev), 50);
 }
 
 static int g4x_do_reset(struct drm_i915_private *dev_priv,
@@ -182,13 +182,13 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 	int ret;
 
 	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
+	I915_WRITE_FW(VDECCLK_GATE_D,
+		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ_FW(VDECCLK_GATE_D);
 
 	pci_write_config_byte(pdev, I915_GDRST,
 			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
 		goto out;
@@ -196,7 +196,7 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 
 	pci_write_config_byte(pdev, I915_GDRST,
 			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
-	ret =  wait_for(g4x_reset_complete(pdev), 500);
+	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
 		goto out;
@@ -205,9 +205,9 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
 out:
 	pci_write_config_byte(pdev, I915_GDRST, 0);
 
-	I915_WRITE(VDECCLK_GATE_D,
-		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
-	POSTING_READ(VDECCLK_GATE_D);
+	I915_WRITE_FW(VDECCLK_GATE_D,
+		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
+	POSTING_READ_FW(VDECCLK_GATE_D);
 
 	return ret;
 }
@@ -218,27 +218,29 @@ static int ironlake_do_reset(struct drm_i915_private *dev_priv,
 {
 	int ret;
 
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
+	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
+	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
+					   ILK_GRDOM_RESET_ENABLE, 0,
+					   5000, 0,
+					   NULL);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
 		goto out;
 	}
 
-	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
-	ret = intel_wait_for_register(dev_priv,
-				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
-				      500);
+	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
+	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
+					   ILK_GRDOM_RESET_ENABLE, 0,
+					   5000, 0,
+					   NULL);
 	if (ret) {
 		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
 		goto out;
 	}
 
 out:
-	I915_WRITE(ILK_GDSR, 0);
-	POSTING_READ(ILK_GDSR);
+	I915_WRITE_FW(ILK_GDSR, 0);
+	POSTING_READ_FW(ILK_GDSR);
 	return ret;
 }
 
@@ -572,7 +574,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 		ret = -ENODEV;
 		if (reset) {
 			GEM_TRACE("engine_mask=%x\n", engine_mask);
+			preempt_disable();
 			ret = reset(i915, engine_mask, retry);
+			preempt_enable();
 		}
 		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
-- 
2.20.1


* [PATCH 03/38] drm/i915/guc: Disable global reset
From: Chris Wilson @ 2019-01-18 14:00 UTC
  To: intel-gfx

The guc (and huc) currently inextricably depend on struct_mutex for
device reinitialisation from inside the reset, and indeed taking any
mutex here is verboten (as we must be able to reset from underneath any
of our mutexes). That makes recovering the guc unviable without, for
example, reserving contiguous vma space and pages for it to use.

The plan to re-enable global reset for the GuC centres around reusing
the WOPCM reserved space at the top of the aperture (where we know we
can populate a contiguous range large enough to dma xfer the fw image).

In the meantime, hopefully no one even notices, as the device reset is
only used as a backup to the per-engine resets for handling GPU hangs.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Acked-by: Daniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
---
 drivers/gpu/drm/i915/i915_reset.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index b9d0ea70361c..2961c21d9420 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -590,6 +590,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 
 bool intel_has_gpu_reset(struct drm_i915_private *i915)
 {
+	if (USES_GUC(i915))
+		return false;
+
 	return intel_get_gpu_reset(i915);
 }
 
-- 
2.20.1


* [PATCH 04/38] drm/i915: Remove GPU reset dependence on struct_mutex
From: Chris Wilson @ 2019-01-18 14:00 UTC
  To: intel-gfx

Now that the submission backends are controlled via their own
spinlocks, with a wave of a magic wand we can lift the struct_mutex
requirement around GPU reset. That is, we allow the submission frontend
(userspace) to keep on submitting while we process the GPU reset, as we
can suspend the backend independently.

The major change is around the backoff/handoff strategy for performing
the reset. With no mutex deadlock, we no longer have to coordinate with
any waiter, and can just perform the reset immediately.
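
Schematically, the recovery path after this patch reduces to something
like the sketch below (heavily simplified, eliding error handling and
the wedging fallbacks; all of the named helpers appear in the diff):

	reset_prepare(i915);		/* quiesce submission on each engine */
	do_reset(i915, stalled_mask);	/* register pokes, atomic-safe, retried */
	/*
	 * do_reset() -> gt_reset(): re-enable the GGTT, then call
	 * intel_engine_reset(engine, stalled) per engine to skip the
	 * guilty request and replay the innocent ones.
	 */
	reset_finish(i915);		/* resume submission */
	reset_restart(i915);		/* worker reloads the kernel context */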

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  38 +-
 drivers/gpu/drm/i915/i915_drv.h               |   5 -
 drivers/gpu/drm/i915/i915_gem.c               |  18 +-
 drivers/gpu/drm/i915/i915_gem_fence_reg.h     |   1 -
 drivers/gpu/drm/i915/i915_gem_gtt.h           |   1 +
 drivers/gpu/drm/i915/i915_gpu_error.c         | 102 +++--
 drivers/gpu/drm/i915/i915_gpu_error.h         |  28 +-
 drivers/gpu/drm/i915/i915_request.c           |  47 ---
 drivers/gpu/drm/i915/i915_reset.c             | 397 ++++++++----------
 drivers/gpu/drm/i915/i915_reset.h             |   3 +
 drivers/gpu/drm/i915/intel_engine_cs.c        |   6 +-
 drivers/gpu/drm/i915/intel_guc_submission.c   |   5 +-
 drivers/gpu/drm/i915/intel_hangcheck.c        |  28 +-
 drivers/gpu/drm/i915/intel_lrc.c              |  92 ++--
 drivers/gpu/drm/i915/intel_overlay.c          |   2 -
 drivers/gpu/drm/i915/intel_ringbuffer.c       |  91 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  17 +-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |  57 +--
 .../drm/i915/selftests/intel_workarounds.c    |   3 -
 .../gpu/drm/i915/selftests/mock_gem_device.c  |   4 +-
 20 files changed, 391 insertions(+), 554 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 24d6d4ce14ef..3ec369980d40 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_puts(m, "Wedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
 		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		struct rb_node *rb;
 
 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
 			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+			   intel_engine_last_submit(engine),
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled),
-			   yesno(engine->hangcheck.wedged));
+					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-			   hangcheck_action_to_str(engine->hangcheck.action),
-			   engine->hangcheck.action,
-			   jiffies_to_msecs(jiffies -
-					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3886,8 +3879,6 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -3900,18 +3891,8 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_backoff(&i915->gpu_error))
 		return -EAGAIN;
 
-	for_each_engine_masked(engine, i915, val, tmp) {
-		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-		engine->hangcheck.stalled = true;
-	}
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-
-	wait_on_bit(&i915->gpu_error.flags,
-		    I915_RESET_HANDOFF,
-		    TASK_UNINTERRUPTIBLE);
-
 	return 0;
 }
 
@@ -4066,13 +4047,8 @@ i915_drop_caches_set(void *data, u64 val)
 		mutex_unlock(&i915->drm.struct_mutex);
 	}
 
-	if (val & DROP_RESET_ACTIVE &&
-	    i915_terminally_wedged(&i915->gpu_error)) {
+	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
 		i915_handle_error(i915, ALL_ENGINES, 0, NULL);
-		wait_on_bit(&i915->gpu_error.flags,
-			    I915_RESET_HANDOFF,
-			    TASK_UNINTERRUPTIBLE);
-	}
 
 	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 310d9e1e1620..94680b15bed0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b359390ba22c..d20b42386c3c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
 		     struct intel_rps_client *rps_client)
 {
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	mutex_lock(&i915->drm.struct_mutex);
-
 	wakeref = intel_runtime_pm_get(i915);
 	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915, wakeref);
 
+	mutex_lock(&i915->drm.struct_mutex);
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 }
@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	wakeref = intel_runtime_pm_get(i915);
 	intel_suspend_gt_powersave(i915);
 
+	flush_workqueue(i915->wq);
+
 	mutex_lock(&i915->drm.struct_mutex);
 
 	/*
@@ -4563,11 +4559,9 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	i915_retire_requests(i915); /* ensure we flush after wedging */
 
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_reset_flush(i915);
 
-	intel_uc_suspend(i915);
-
-	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-	cancel_delayed_work_sync(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.retire_work);
 
 	/*
 	 * As the idle_work is rearming if it detects a race, play safe and
@@ -4575,6 +4569,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	 */
 	drain_delayed_work(&i915->gt.idle_work);
 
+	intel_uc_suspend(i915);
+
 	/*
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.
diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
index 99a31ded4dfd..09dcaf14121b 100644
--- a/drivers/gpu/drm/i915/i915_gem_fence_reg.h
+++ b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
 };
 
 #endif
-
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 9229b03d629b..a0039ea97cdc 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -39,6 +39,7 @@
 #include <linux/pagevec.h>
 
 #include "i915_request.h"
+#include "i915_reset.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
 
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 1f8e80e31b49..afccffa9f3f9 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-	err_printf(m, "  hangcheck action: %s\n",
-		   hangcheck_action_to_str(ee->hangcheck_action));
-	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
+	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
 		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
 		   ee->hangcheck_timestamp,
 		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(error->capture - error->epoch));
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-				   engine_name(m->i915, i),
-				   error->engine[i].context.comm,
-				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score,
-				   bannable(&error->engine[i].context));
-		}
+		if (!error->engine[i].context.pid)
+			continue;
+
+		err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+			   engine_name(m->i915, i),
+			   error->engine[i].context.comm,
+			   error->engine[i].context.pid,
+			   error->engine[i].context.ban_score,
+			   bannable(&error->engine[i].context));
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -1153,29 +1150,22 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
  *
  * It's only a small step better than a random number in its current form.
  */
-static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
-				    struct i915_gpu_state *error,
-				    int *engine_id)
+static u32 i915_error_generate_code(struct i915_gpu_state *error,
+				    unsigned long engine_mask)
 {
-	u32 error_code = 0;
-	int i;
-
-	/* IPEHR would be an ideal way to detect errors, as it's the gross
+	/*
+	 * IPEHR would be an ideal way to detect errors, as it's the gross
 	 * measure of "the command that hung." However, has some very common
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_stalled) {
-			if (engine_id)
-				*engine_id = i;
-
-			return error->engine[i].ipehr ^
-			       error->engine[i].instdone.instdone;
-		}
+	if (engine_mask) {
+		int i = ffs(engine_mask);
+		return error->engine[i].ipehr ^
+			error->engine[i].instdone.instdone;
 	}
 
-	return error_code;
+	return 0;
 }
 
 static void gem_record_fences(struct i915_gpu_state *error)
@@ -1338,9 +1328,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 
 	ee->idle = intel_engine_is_idle(engine);
-	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-	ee->hangcheck_action = engine->hangcheck.action;
-	ee->hangcheck_stalled = engine->hangcheck.stalled;
+	if (!ee->idle)
+		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
 						  engine);
 
@@ -1783,31 +1772,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-				   struct i915_gpu_state *error,
-				   u32 engine_mask,
-				   const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
 {
-	u32 ecode;
-	int engine_id = -1, len;
+	int len;
+	int i;
 
-	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
+		if (!error->engine[i].context.pid)
+			engines &= ~BIT(i);
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
-			INTEL_GEN(dev_priv), engine_id, ecode);
-
-	if (engine_id != -1 && error->engine[engine_id].context.pid)
+			"GPU HANG: ecode %d:%lx:0x%08x",
+			INTEL_GEN(error->i915), engines,
+			i915_error_generate_code(error, engines));
+	if (engines) {
+		/* Just show the first executing process, more is confusing */
+		i = ffs(engines);
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[engine_id].context.comm,
-				 error->engine[engine_id].context.pid);
+				 error->engine[i].context.comm,
+				 error->engine[i].context.pid);
+	}
+	if (msg)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", %s", msg);
 
-	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  engine_mask ? "reset" : "continue");
+	return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
@@ -1847,7 +1840,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		const struct drm_i915_error_engine *ee = &error->engine[i];
 
-		if (ee->hangcheck_stalled &&
+		if (ee->hangcheck_timestamp &&
 		    time_before(ee->hangcheck_timestamp, epoch))
 			epoch = ee->hangcheck_timestamp;
 	}
@@ -1921,7 +1914,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: a message to insert into the error capture header
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
@@ -1929,8 +1922,8 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-			      u32 engine_mask,
-			      const char *error_msg)
+			      unsigned long engine_mask,
+			      const char *msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
@@ -1946,8 +1939,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	if (IS_ERR(error))
 		return;
 
-	i915_error_capture_msg(i915, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&i915->gpu_error.lock, flags);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 604291f7762d..231173786eae 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -85,8 +85,6 @@ struct i915_gpu_state {
 		bool waiting;
 		int num_waiters;
 		unsigned long hangcheck_timestamp;
-		bool hangcheck_stalled;
-		enum intel_engine_hangcheck_action hangcheck_action;
 		struct i915_address_space *vm;
 		int num_requests;
 		u32 reset_count;
@@ -197,6 +195,8 @@ struct i915_gpu_state {
 	struct scatterlist *sgl, *fit;
 };
 
+struct i915_gpu_restart;
+
 struct i915_gpu_error {
 	/* For hangcheck timer */
 #define DRM_I915_HANGCHECK_PERIOD 1500 /* in ms */
@@ -247,15 +247,6 @@ struct i915_gpu_error {
 	 * i915_mutex_lock_interruptible()?). I915_RESET_BACKOFF serves a
 	 * secondary role in preventing two concurrent global reset attempts.
 	 *
-	 * #I915_RESET_HANDOFF - To perform the actual GPU reset, we need the
-	 * struct_mutex. We try to acquire the struct_mutex in the reset worker,
-	 * but it may be held by some long running waiter (that we cannot
-	 * interrupt without causing trouble). Once we are ready to do the GPU
-	 * reset, we set the I915_RESET_HANDOFF bit and wakeup any waiters. If
-	 * they already hold the struct_mutex and want to participate they can
-	 * inspect the bit and do the reset directly, otherwise the worker
-	 * waits for the struct_mutex.
-	 *
 	 * #I915_RESET_ENGINE[num_engines] - Since the driver doesn't need to
 	 * acquire the struct_mutex to reset an engine, we need an explicit
 	 * flag to prevent two concurrent reset attempts in the same engine.
@@ -269,20 +260,13 @@ struct i915_gpu_error {
 	 */
 	unsigned long flags;
 #define I915_RESET_BACKOFF	0
-#define I915_RESET_HANDOFF	1
-#define I915_RESET_MODESET	2
-#define I915_RESET_ENGINE	3
+#define I915_RESET_MODESET	1
+#define I915_RESET_ENGINE	2
 #define I915_WEDGED		(BITS_PER_LONG - 1)
 
 	/** Number of times an engine has been reset */
 	u32 reset_engine_count[I915_NUM_ENGINES];
 
-	/** Set of stalled engines with guilty requests, in the current reset */
-	u32 stalled_mask;
-
-	/** Reason for the current *global* reset */
-	const char *reason;
-
 	struct mutex wedge_mutex; /* serialises wedging/unwedging */
 
 	/**
@@ -299,6 +283,8 @@ struct i915_gpu_error {
 
 	/* For missed irq/seqno simulation. */
 	unsigned long test_irq_rings;
+
+	struct i915_gpu_restart *restart;
 };
 
 struct drm_i915_error_state_buf {
@@ -320,7 +306,7 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...);
 
 struct i915_gpu_state *i915_capture_gpu_state(struct drm_i915_private *i915);
 void i915_capture_error_state(struct drm_i915_private *dev_priv,
-			      u32 engine_mask,
+			      unsigned long engine_mask,
 			      const char *error_msg);
 
 static inline struct i915_gpu_state *
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5403d4e2cee0..fb723ed2f574 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -1076,18 +1076,6 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	return false;
 }
 
-static bool __i915_wait_request_check_and_reset(struct i915_request *request)
-{
-	struct i915_gpu_error *error = &request->i915->gpu_error;
-
-	if (likely(!i915_reset_handoff(error)))
-		return false;
-
-	__set_current_state(TASK_RUNNING);
-	i915_reset(request->i915, error->stalled_mask, error->reason);
-	return true;
-}
-
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1113,17 +1101,10 @@ long i915_request_wait(struct i915_request *rq,
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	wait_queue_head_t *errq = &rq->i915->gpu_error.wait_queue;
-	DEFINE_WAIT_FUNC(reset, default_wake_function);
 	DEFINE_WAIT_FUNC(exec, default_wake_function);
 	struct intel_wait wait;
 
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&rq->i915->drm.struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	if (i915_request_completed(rq))
@@ -1133,11 +1114,7 @@ long i915_request_wait(struct i915_request *rq,
 		return -ETIME;
 
 	trace_i915_request_wait_begin(rq, flags);
-
 	add_wait_queue(&rq->execute, &exec);
-	if (flags & I915_WAIT_LOCKED)
-		add_wait_queue(errq, &reset);
-
 	intel_wait_init(&wait);
 	if (flags & I915_WAIT_PRIORITY)
 		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
@@ -1148,10 +1125,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (intel_wait_update_request(&wait, rq))
 			break;
 
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
 			goto complete;
@@ -1181,9 +1154,6 @@ long i915_request_wait(struct i915_request *rq,
 		 */
 		goto wakeup;
 
-	if (flags & I915_WAIT_LOCKED)
-		__i915_wait_request_check_and_reset(rq);
-
 	for (;;) {
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
@@ -1207,21 +1177,6 @@ long i915_request_wait(struct i915_request *rq,
 		if (i915_request_completed(rq))
 			break;
 
-		/*
-		 * If the GPU is hung, and we hold the lock, reset the GPU
-		 * and then check for completion. On a full reset, the engine's
-		 * HW seqno will be advanced passed us and we are complete.
-		 * If we do a partial reset, we have to wait for the GPU to
-		 * resume and update the breadcrumb.
-		 *
-		 * If we don't hold the mutex, we can just wait for the worker
-		 * to come along and update the breadcrumb (either directly
-		 * itself, or indirectly by recovering the GPU).
-		 */
-		if (flags & I915_WAIT_LOCKED &&
-		    __i915_wait_request_check_and_reset(rq))
-			continue;
-
 		/* Only spin if we know the GPU is processing this request */
 		if (__i915_spin_request(rq, wait.seqno, state, 2))
 			break;
@@ -1235,8 +1190,6 @@ long i915_request_wait(struct i915_request *rq,
 	intel_engine_remove_wait(rq->engine, &wait);
 complete:
 	__set_current_state(TASK_RUNNING);
-	if (flags & I915_WAIT_LOCKED)
-		remove_wait_queue(errq, &reset);
 	remove_wait_queue(&rq->execute, &exec);
 	trace_i915_request_wait_end(rq);
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 2961c21d9420..064fc6da1512 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -5,6 +5,7 @@
  */
 
 #include <linux/sched/mm.h>
+#include <linux/stop_machine.h>
 
 #include "i915_drv.h"
 #include "i915_gpu_error.h"
@@ -17,22 +18,23 @@ static void engine_skip_context(struct i915_request *rq)
 	struct intel_engine_cs *engine = rq->engine;
 	struct i915_gem_context *hung_ctx = rq->gem_context;
 	struct i915_timeline *timeline = rq->timeline;
-	unsigned long flags;
 
+	lockdep_assert_held(&engine->timeline.lock);
 	GEM_BUG_ON(timeline == &engine->timeline);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
 	spin_lock(&timeline->lock);
 
-	list_for_each_entry_continue(rq, &engine->timeline.requests, link)
-		if (rq->gem_context == hung_ctx)
-			i915_request_skip(rq, -EIO);
+	if (rq->global_seqno) {
+		list_for_each_entry_continue(rq,
+					     &engine->timeline.requests, link)
+			if (rq->gem_context == hung_ctx)
+				i915_request_skip(rq, -EIO);
+	}
 
 	list_for_each_entry(rq, &timeline->requests, link)
 		i915_request_skip(rq, -EIO);
 
 	spin_unlock(&timeline->lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void client_mark_guilty(struct drm_i915_file_private *file_priv,
@@ -59,7 +61,7 @@ static void client_mark_guilty(struct drm_i915_file_private *file_priv,
 	}
 }
 
-static void context_mark_guilty(struct i915_gem_context *ctx)
+static bool context_mark_guilty(struct i915_gem_context *ctx)
 {
 	unsigned int score;
 	bool banned, bannable;
@@ -72,7 +74,7 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	/* Cool contexts don't accumulate client ban score */
 	if (!bannable)
-		return;
+		return false;
 
 	if (banned) {
 		DRM_DEBUG_DRIVER("context %s: guilty %d, score %u, banned\n",
@@ -83,6 +85,8 @@ static void context_mark_guilty(struct i915_gem_context *ctx)
 
 	if (!IS_ERR_OR_NULL(ctx->file_priv))
 		client_mark_guilty(ctx->file_priv, ctx);
+
+	return banned;
 }
 
 static void context_mark_innocent(struct i915_gem_context *ctx)
@@ -90,6 +94,21 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
 	atomic_inc(&ctx->active_count);
 }
 
+void i915_reset_request(struct i915_request *rq, bool guilty)
+{
+	lockdep_assert_held(&rq->engine->timeline.lock);
+	GEM_BUG_ON(i915_request_completed(rq));
+
+	if (guilty) {
+		i915_request_skip(rq, -EIO);
+		if (context_mark_guilty(rq->gem_context))
+			engine_skip_context(rq);
+	} else {
+		dma_fence_set_error(&rq->fence, -EAGAIN);
+		context_mark_innocent(rq->gem_context);
+	}
+}
+
 static void gen3_stop_engine(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
@@ -533,22 +552,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 	int retry;
 	int ret;
 
-	/*
-	 * We want to perform per-engine reset from atomic context (e.g.
-	 * softirq), which imposes the constraint that we cannot sleep.
-	 * However, experience suggests that spending a bit of time waiting
-	 * for a reset helps in various cases, so for a full-device reset
-	 * we apply the opposite rule and wait if we want to. As we should
-	 * always follow up a failed per-engine reset with a full device reset,
-	 * being a little faster, stricter and more error prone for the
-	 * atomic case seems an acceptable compromise.
-	 *
-	 * Unfortunately this leads to a bimodal routine, when the goal was
-	 * to have a single reset function that worked for resetting any
-	 * number of engines simultaneously.
-	 */
-	might_sleep_if(engine_mask == ALL_ENGINES);
-
 	/*
 	 * If the power well sleeps during the reset, the reset
 	 * request may be dropped and never completes (causing -EIO).
@@ -580,8 +583,6 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
 		}
 		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
 			break;
-
-		cond_resched();
 	}
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 
@@ -620,11 +621,8 @@ int intel_reset_guc(struct drm_i915_private *i915)
  * Ensure irq handler finishes, and not run again.
  * Also return the active request so that we only search for it once.
  */
-static struct i915_request *
-reset_prepare_engine(struct intel_engine_cs *engine)
+static void reset_prepare_engine(struct intel_engine_cs *engine)
 {
-	struct i915_request *rq;
-
 	/*
 	 * During the reset sequence, we must prevent the engine from
 	 * entering RC6. As the context state is undefined until we restart
@@ -633,162 +631,86 @@ reset_prepare_engine(struct intel_engine_cs *engine)
 	 * GPU state upon resume, i.e. fail to restart after a reset.
 	 */
 	intel_uncore_forcewake_get(engine->i915, FORCEWAKE_ALL);
-
-	rq = engine->reset.prepare(engine);
-	if (rq && rq->fence.error == -EIO)
-		rq = ERR_PTR(-EIO); /* Previous reset failed! */
-
-	return rq;
+	engine->reset.prepare(engine);
 }
 
-static int reset_prepare(struct drm_i915_private *i915)
+static void reset_prepare(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
-	struct i915_request *rq;
 	enum intel_engine_id id;
-	int err = 0;
 
-	for_each_engine(engine, i915, id) {
-		rq = reset_prepare_engine(engine);
-		if (IS_ERR(rq)) {
-			err = PTR_ERR(rq);
-			continue;
-		}
-
-		engine->hangcheck.active_request = rq;
-	}
+	for_each_engine(engine, i915, id)
+		reset_prepare_engine(engine);
 
-	i915_gem_revoke_fences(i915);
 	intel_uc_sanitize(i915);
-
-	return err;
 }
 
-/* Returns the request if it was guilty of the hang */
-static struct i915_request *
-reset_request(struct intel_engine_cs *engine,
-	      struct i915_request *rq,
-	      bool stalled)
+static int gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
 {
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err;
+
 	/*
-	 * The guilty request will get skipped on a hung engine.
-	 *
-	 * Users of client default contexts do not rely on logical
-	 * state preserved between batches so it is safe to execute
-	 * queued requests following the hang. Non default contexts
-	 * rely on preserved state, so skipping a batch loses the
-	 * evolution of the state and it needs to be considered corrupted.
-	 * Executing more queued batches on top of corrupted state is
-	 * risky. But we take the risk by trying to advance through
-	 * the queued requests in order to make the client behaviour
-	 * more predictable around resets, by not throwing away random
-	 * amount of batches it has prepared for execution. Sophisticated
-	 * clients can use gem_reset_stats_ioctl and dma fence status
-	 * (exported via sync_file info ioctl on explicit fences) to observe
-	 * when it loses the context state and should rebuild accordingly.
-	 *
-	 * The context ban, and ultimately the client ban, mechanism are safety
-	 * valves if client submission ends up resulting in nothing more than
-	 * subsequent hangs.
+	 * Everything depends on having the GTT running, so we need to start
+	 * there.
 	 */
+	err = i915_ggtt_enable_hw(i915);
+	if (err)
+		return err;
 
-	if (i915_request_completed(rq)) {
-		GEM_TRACE("%s pardoned global=%d (fence %llx:%lld), current %d\n",
-			  engine->name, rq->global_seqno,
-			  rq->fence.context, rq->fence.seqno,
-			  intel_engine_get_seqno(engine));
-		stalled = false;
-	}
-
-	if (stalled) {
-		context_mark_guilty(rq->gem_context);
-		i915_request_skip(rq, -EIO);
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, stalled_mask & ENGINE_MASK(id));
 
-		/* If this context is now banned, skip all pending requests. */
-		if (i915_gem_context_is_banned(rq->gem_context))
-			engine_skip_context(rq);
-	} else {
-		/*
-		 * Since this is not the hung engine, it may have advanced
-		 * since the hang declaration. Double check by refinding
-		 * the active request at the time of the reset.
-		 */
-		rq = i915_gem_find_active_request(engine);
-		if (rq) {
-			unsigned long flags;
-
-			context_mark_innocent(rq->gem_context);
-			dma_fence_set_error(&rq->fence, -EAGAIN);
-
-			/* Rewind the engine to replay the incomplete rq */
-			spin_lock_irqsave(&engine->timeline.lock, flags);
-			rq = list_prev_entry(rq, link);
-			if (&rq->link == &engine->timeline.requests)
-				rq = NULL;
-			spin_unlock_irqrestore(&engine->timeline.lock, flags);
-		}
-	}
+	i915_gem_restore_fences(i915);
 
-	return rq;
+	return err;
 }
 
-static void reset_engine(struct intel_engine_cs *engine,
-			 struct i915_request *rq,
-			 bool stalled)
+static void reset_finish_engine(struct intel_engine_cs *engine)
 {
-	if (rq)
-		rq = reset_request(engine, rq, stalled);
-
-	/* Setup the CS to resume from the breadcrumb of the hung request */
-	engine->reset.reset(engine, rq);
+	engine->reset.finish(engine);
+	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
 }
 
-static void gt_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+struct i915_gpu_restart {
+	struct work_struct work;
+	struct drm_i915_private *i915;
+};
+
+static void restart_work(struct work_struct *work)
 {
+	struct i915_gpu_restart *arg = container_of(work, typeof(*arg), work);
+	struct drm_i915_private *i915 = arg->i915;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+	mutex_lock(&i915->drm.struct_mutex);
 
-	i915_retire_requests(i915);
+	smp_store_mb(i915->gpu_error.restart, NULL);
 
 	for_each_engine(engine, i915, id) {
-		struct intel_context *ce;
-
-		reset_engine(engine,
-			     engine->hangcheck.active_request,
-			     stalled_mask & ENGINE_MASK(id));
-		ce = fetch_and_zero(&engine->last_retired_context);
-		if (ce)
-			intel_context_unpin(ce);
+		struct i915_request *rq;
 
 		/*
 		 * Ostensibily, we always want a context loaded for powersaving,
 		 * so if the engine is idle after the reset, send a request
 		 * to load our scratch kernel_context.
-		 *
-		 * More mysteriously, if we leave the engine idle after a reset,
-		 * the next userspace batch may hang, with what appears to be
-		 * an incoherent read by the CS (presumably stale TLB). An
-		 * empty request appears sufficient to paper over the glitch.
 		 */
-		if (intel_engine_is_idle(engine)) {
-			struct i915_request *rq;
+		if (!intel_engine_is_idle(engine))
+			continue;
 
-			rq = i915_request_alloc(engine, i915->kernel_context);
-			if (!IS_ERR(rq))
-				i915_request_add(rq);
-		}
+		rq = i915_request_alloc(engine, i915->kernel_context);
+		if (!IS_ERR(rq))
+			i915_request_add(rq);
 	}
 
-	i915_gem_restore_fences(i915);
-}
-
-static void reset_finish_engine(struct intel_engine_cs *engine)
-{
-	engine->reset.finish(engine);
+	mutex_unlock(&i915->drm.struct_mutex);
+	intel_runtime_pm_put(i915, wakeref);
 
-	intel_uncore_forcewake_put(engine->i915, FORCEWAKE_ALL);
+	kfree(arg);
 }
 
 static void reset_finish(struct drm_i915_private *i915)
@@ -796,11 +718,30 @@ static void reset_finish(struct drm_i915_private *i915)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
-	for_each_engine(engine, i915, id) {
-		engine->hangcheck.active_request = NULL;
+	for_each_engine(engine, i915, id)
 		reset_finish_engine(engine);
+}
+
+static void reset_restart(struct drm_i915_private *i915)
+{
+	struct i915_gpu_restart *arg;
+
+	/*
+	 * Following the reset, ensure that we always reload context for
+	 * powersaving, and to correct engine->last_retired_context. Since
+	 * this requires us to submit a request, queue a worker to do that
+	 * task for us to evade any locking here.
+	 */
+	if (READ_ONCE(i915->gpu_error.restart))
+		return;
+
+	arg = kmalloc(sizeof(*arg), GFP_KERNEL);
+	if (arg) {
+		arg->i915 = i915;
+		INIT_WORK(&arg->work, restart_work);
+
+		WRITE_ONCE(i915->gpu_error.restart, arg);
+		queue_work(i915->wq, &arg->work);
 	}
 }
 
@@ -889,8 +830,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	struct i915_timeline *tl;
 	bool ret = false;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
 	if (!test_bit(I915_WEDGED, &error->flags))
 		return true;
 
@@ -913,9 +852,9 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 */
 	list_for_each_entry(tl, &i915->gt.timelines, link) {
 		struct i915_request *rq;
+		long timeout;
 
-		rq = i915_gem_active_peek(&tl->last_request,
-					  &i915->drm.struct_mutex);
+		rq = i915_gem_active_get_unlocked(&tl->last_request);
 		if (!rq)
 			continue;
 
@@ -930,12 +869,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		 * and when the seqno passes the fence, the signaler
 		 * then signals the fence waking us up).
 		 */
-		if (dma_fence_default_wait(&rq->fence, true,
-					   MAX_SCHEDULE_TIMEOUT) < 0)
+		timeout = dma_fence_default_wait(&rq->fence, true,
+						 MAX_SCHEDULE_TIMEOUT);
+		i915_request_put(rq);
+		if (timeout < 0)
 			goto unlock;
 	}
-	i915_retire_requests(i915);
-	GEM_BUG_ON(i915->gt.active_requests);
 
 	intel_engines_sanitize(i915, false);
 
@@ -949,7 +888,6 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 * context and do not require stop_machine().
 	 */
 	intel_engines_reset_default_submission(i915);
-	i915_gem_contexts_lost(i915);
 
 	GEM_TRACE("end\n");
 
@@ -962,6 +900,43 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	return ret;
 }
 
+struct __i915_reset {
+	struct drm_i915_private *i915;
+	unsigned int stalled_mask;
+};
+
+static int __i915_reset__BKL(void *data)
+{
+	struct __i915_reset *arg = data;
+	int err;
+
+	err = intel_gpu_reset(arg->i915, ALL_ENGINES);
+	if (err)
+		return err;
+
+	return gt_reset(arg->i915, arg->stalled_mask);
+}
+
+#if 0
+#define __do_reset(fn, arg) stop_machine(fn, arg, NULL)
+#else
+#define __do_reset(fn, arg) fn(arg)
+#endif
+
+static int do_reset(struct drm_i915_private *i915, unsigned int stalled_mask)
+{
+	struct __i915_reset arg = { i915, stalled_mask };
+	int err, i;
+
+	err = __do_reset(__i915_reset__BKL, &arg);
+	for (i = 0; err && i < 3; i++) {
+		msleep(100);
+		err = __do_reset(__i915_reset__BKL, &arg);
+	}
+
+	return err;
+}
+
 /**
  * i915_reset - reset chip after a hang
  * @i915: #drm_i915_private to reset
@@ -987,31 +962,22 @@ void i915_reset(struct drm_i915_private *i915,
 {
 	struct i915_gpu_error *error = &i915->gpu_error;
 	int ret;
-	int i;
 
 	GEM_TRACE("flags=%lx\n", error->flags);
 
 	might_sleep();
-	lockdep_assert_held(&i915->drm.struct_mutex);
 	assert_rpm_wakelock_held(i915);
 	GEM_BUG_ON(!test_bit(I915_RESET_BACKOFF, &error->flags));
 
-	if (!test_bit(I915_RESET_HANDOFF, &error->flags))
-		return;
-
 	/* Clear any previous failed attempts at recovery. Time to try again. */
 	if (!i915_gem_unset_wedged(i915))
-		goto wakeup;
+		return;
 
 	if (reason)
 		dev_notice(i915->drm.dev, "Resetting chip for %s\n", reason);
 	error->reset_count++;
 
-	ret = reset_prepare(i915);
-	if (ret) {
-		dev_err(i915->drm.dev, "GPU recovery failed\n");
-		goto taint;
-	}
+	reset_prepare(i915);
 
 	if (!intel_has_gpu_reset(i915)) {
 		if (i915_modparams.reset)
@@ -1021,32 +987,11 @@ void i915_reset(struct drm_i915_private *i915,
 		goto error;
 	}
 
-	for (i = 0; i < 3; i++) {
-		ret = intel_gpu_reset(i915, ALL_ENGINES);
-		if (ret == 0)
-			break;
-
-		msleep(100);
-	}
-	if (ret) {
+	if (do_reset(i915, stalled_mask)) {
 		dev_err(i915->drm.dev, "Failed to reset chip\n");
 		goto taint;
 	}
 
-	/* Ok, now get things going again... */
-
-	/*
-	 * Everything depends on having the GTT running, so we need to start
-	 * there.
-	 */
-	ret = i915_ggtt_enable_hw(i915);
-	if (ret) {
-		DRM_ERROR("Failed to re-enable GGTT following reset (%d)\n",
-			  ret);
-		goto error;
-	}
-
-	gt_reset(i915, stalled_mask);
 	intel_overlay_reset(i915);
 
 	/*
@@ -1068,9 +1013,8 @@ void i915_reset(struct drm_i915_private *i915,
 
 finish:
 	reset_finish(i915);
-wakeup:
-	clear_bit(I915_RESET_HANDOFF, &error->flags);
-	wake_up_bit(&error->flags, I915_RESET_HANDOFF);
+	if (!i915_terminally_wedged(error))
+		reset_restart(i915);
 	return;
 
 taint:
@@ -1089,7 +1033,6 @@ void i915_reset(struct drm_i915_private *i915,
 	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 error:
 	i915_gem_set_wedged(i915);
-	i915_retire_requests(i915);
 	goto finish;
 }
 
@@ -1115,18 +1058,16 @@ static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 {
 	struct i915_gpu_error *error = &engine->i915->gpu_error;
-	struct i915_request *active_request;
 	int ret;
 
 	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-	active_request = reset_prepare_engine(engine);
-	if (IS_ERR_OR_NULL(active_request)) {
-		/* Either the previous reset failed, or we pardon the reset. */
-		ret = PTR_ERR(active_request);
-		goto out;
-	}
+	if (i915_seqno_passed(intel_engine_get_seqno(engine),
+			      intel_engine_last_submit(engine)))
+		return 0;
+
+	reset_prepare_engine(engine);
 
 	if (msg)
 		dev_notice(engine->i915->drm.dev,
@@ -1150,7 +1091,7 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	 * active request and can drop it, adjust head to skip the offending
 	 * request to resume executing remaining requests in the queue.
 	 */
-	reset_engine(engine, active_request, true);
+	intel_engine_reset(engine, true);
 
 	/*
 	 * The engine and its registers (and workarounds in case of render)
@@ -1187,30 +1128,7 @@ static void i915_reset_device(struct drm_i915_private *i915,
 	i915_wedge_on_timeout(&w, i915, 5 * HZ) {
 		intel_prepare_reset(i915);
 
-		error->reason = reason;
-		error->stalled_mask = engine_mask;
-
-		/* Signal that locked waiters should reset the GPU */
-		smp_mb__before_atomic();
-		set_bit(I915_RESET_HANDOFF, &error->flags);
-		wake_up_all(&error->wait_queue);
-
-		/*
-		 * Wait for anyone holding the lock to wakeup, without
-		 * blocking indefinitely on struct_mutex.
-		 */
-		do {
-			if (mutex_trylock(&i915->drm.struct_mutex)) {
-				i915_reset(i915, engine_mask, reason);
-				mutex_unlock(&i915->drm.struct_mutex);
-			}
-		} while (wait_on_bit_timeout(&error->flags,
-					     I915_RESET_HANDOFF,
-					     TASK_UNINTERRUPTIBLE,
-					     1));
-
-		error->stalled_mask = 0;
-		error->reason = NULL;
+		i915_reset(i915, engine_mask, reason);
 
 		intel_finish_reset(i915);
 	}
@@ -1366,6 +1284,25 @@ void i915_handle_error(struct drm_i915_private *i915,
 	intel_runtime_pm_put(i915, wakeref);
 }
 
+bool i915_reset_flush(struct drm_i915_private *i915)
+{
+	int err;
+
+	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
+	flush_workqueue(i915->wq);
+	GEM_BUG_ON(READ_ONCE(i915->gpu_error.restart));
+
+	mutex_lock(&i915->drm.struct_mutex);
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_LOCKED |
+				     I915_WAIT_FOR_IDLE_BOOST,
+				     MAX_SCHEDULE_TIMEOUT);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return !err;
+}
+
 static void i915_wedge_me(struct work_struct *work)
 {
 	struct i915_wedge_me *w = container_of(work, typeof(*w), work.work);
diff --git a/drivers/gpu/drm/i915/i915_reset.h b/drivers/gpu/drm/i915/i915_reset.h
index b6a519bde67d..f2d347f319df 100644
--- a/drivers/gpu/drm/i915/i915_reset.h
+++ b/drivers/gpu/drm/i915/i915_reset.h
@@ -29,6 +29,9 @@ void i915_reset(struct drm_i915_private *i915,
 int i915_reset_engine(struct intel_engine_cs *engine,
 		      const char *reason);
 
+void i915_reset_request(struct i915_request *rq, bool guilty);
+bool i915_reset_flush(struct drm_i915_private *i915);
+
 bool intel_has_gpu_reset(struct drm_i915_private *i915);
 bool intel_has_reset_engine(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 2f3c71f6d313..fc52737751e7 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1071,10 +1071,8 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force)
 	if (!reset_engines(i915) && !force)
 		return;
 
-	for_each_engine(engine, i915, id) {
-		if (engine->reset.reset)
-			engine->reset.reset(engine, NULL);
-	}
+	for_each_engine(engine, i915, id)
+		intel_engine_reset(engine, false);
 }
 
 /**
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index ab1c49b106f2..7217c7e3ee8d 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -834,8 +834,7 @@ static void guc_submission_tasklet(unsigned long data)
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
-static struct i915_request *
-guc_reset_prepare(struct intel_engine_cs *engine)
+static void guc_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
 
@@ -861,8 +860,6 @@ guc_reset_prepare(struct intel_engine_cs *engine)
 	 */
 	if (engine->i915->guc.preempt_wq)
 		flush_workqueue(engine->i915->guc.preempt_wq);
-
-	return i915_gem_find_active_request(engine);
 }
 
 /*
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 7dc11fcb13de..925146dcc0f8 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -25,6 +25,17 @@
 #include "i915_drv.h"
 #include "i915_reset.h"
 
+struct hangcheck {
+	u64 acthd;
+	u32 seqno;
+	enum intel_engine_hangcheck_action action;
+	unsigned long action_timestamp;
+	int deadlock;
+	struct intel_instdone instdone;
+	bool wedged:1;
+	bool stalled:1;
+};
+
 static bool instdone_unchanged(u32 current_instdone, u32 *old_instdone)
 {
 	u32 tmp = current_instdone | *old_instdone;
@@ -119,25 +130,22 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 }
 
 static void hangcheck_load_sample(struct intel_engine_cs *engine,
-				  struct intel_engine_hangcheck *hc)
+				  struct hangcheck *hc)
 {
 	hc->acthd = intel_engine_get_active_head(engine);
 	hc->seqno = intel_engine_get_seqno(engine);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
-				   const struct intel_engine_hangcheck *hc)
+				   const struct hangcheck *hc)
 {
 	engine->hangcheck.acthd = hc->acthd;
 	engine->hangcheck.seqno = hc->seqno;
-	engine->hangcheck.action = hc->action;
-	engine->hangcheck.stalled = hc->stalled;
-	engine->hangcheck.wedged = hc->wedged;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
-		     const struct intel_engine_hangcheck *hc)
+		     const struct hangcheck *hc)
 {
 	if (engine->hangcheck.seqno != hc->seqno)
 		return ENGINE_ACTIVE_SEQNO;
@@ -149,7 +157,7 @@ hangcheck_get_action(struct intel_engine_cs *engine,
 }
 
 static void hangcheck_accumulate_sample(struct intel_engine_cs *engine,
-					struct intel_engine_hangcheck *hc)
+					struct hangcheck *hc)
 {
 	unsigned long timeout = I915_ENGINE_DEAD_TIMEOUT;
 
@@ -269,19 +277,19 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	intel_uncore_arm_unclaimed_mmio_detection(dev_priv);
 
 	for_each_engine(engine, dev_priv, id) {
-		struct intel_engine_hangcheck hc;
+		struct hangcheck hc;
 
 		hangcheck_load_sample(engine, &hc);
 		hangcheck_accumulate_sample(engine, &hc);
 		hangcheck_store_sample(engine, &hc);
 
-		if (engine->hangcheck.stalled) {
+		if (hc.stalled) {
 			hung |= intel_engine_flag(engine);
 			if (hc.action != ENGINE_DEAD)
 				stuck |= intel_engine_flag(engine);
 		}
 
-		if (engine->hangcheck.wedged)
+		if (hc.wedged)
 			wedged |= intel_engine_flag(engine);
 	}
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 379a43ed2f90..ebb693c93046 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -136,6 +136,7 @@
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_vgpu.h"
 #include "intel_lrc_reg.h"
 #include "intel_mocs.h"
@@ -266,7 +267,8 @@ static void unwind_wa_tail(struct i915_request *rq)
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
 
-static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
+static struct i915_request *
+__unwind_incomplete_requests(struct intel_engine_cs *engine)
 {
 	struct i915_request *rq, *rn, *active = NULL;
 	struct list_head *uninitialized_var(pl);
@@ -307,6 +309,8 @@ static void __unwind_incomplete_requests(struct intel_engine_cs *engine)
 		list_move_tail(&active->sched.link,
 			       i915_sched_lookup_priolist(engine, prio));
 	}
+
+	return active;
 }
 
 void
@@ -1724,11 +1728,9 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	return 0;
 }
 
-static struct i915_request *
-execlists_reset_prepare(struct intel_engine_cs *engine)
+static void execlists_reset_prepare(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
-	struct i915_request *request, *active;
 	unsigned long flags;
 
 	GEM_TRACE("%s: depth<-%d\n", engine->name,
@@ -1744,59 +1746,21 @@ execlists_reset_prepare(struct intel_engine_cs *engine)
 	 * prevents the race.
 	 */
 	__tasklet_disable_sync_once(&execlists->tasklet);
+	GEM_BUG_ON(!reset_in_progress(execlists));
 
+	/* And flush any current direct submission. */
 	spin_lock_irqsave(&engine->timeline.lock, flags);
-
-	/*
-	 * We want to flush the pending context switches, having disabled
-	 * the tasklet above, we can assume exclusive access to the execlists.
-	 * For this allows us to catch up with an inflight preemption event,
-	 * and avoid blaming an innocent request if the stall was due to the
-	 * preemption itself.
-	 */
-	process_csb(engine);
-
-	/*
-	 * The last active request can then be no later than the last request
-	 * now in ELSP[0]. So search backwards from there, so that if the GPU
-	 * has advanced beyond the last CSB update, it will be pardoned.
-	 */
-	active = NULL;
-	request = port_request(execlists->port);
-	if (request) {
-		/*
-		 * Prevent the breadcrumb from advancing before we decide
-		 * which request is currently active.
-		 */
-		intel_engine_stop_cs(engine);
-
-		list_for_each_entry_from_reverse(request,
-						 &engine->timeline.requests,
-						 link) {
-			if (__i915_request_completed(request,
-						     request->global_seqno))
-				break;
-
-			active = request;
-		}
-	}
-
+	process_csb(engine); /* drain preemption events */
 	spin_unlock_irqrestore(&engine->timeline.lock, flags);
-
-	return active;
 }
 
-static void execlists_reset(struct intel_engine_cs *engine,
-			    struct i915_request *request)
+static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
+	struct i915_request *rq;
 	unsigned long flags;
 	u32 *regs;
 
-	GEM_TRACE("%s request global=%d, current=%d\n",
-		  engine->name, request ? request->global_seqno : 0,
-		  intel_engine_get_seqno(engine));
-
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 
 	/*
@@ -1811,12 +1775,18 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	execlists_cancel_port_requests(execlists);
 
 	/* Push back any incomplete requests for replay after the reset. */
-	__unwind_incomplete_requests(engine);
+	rq = __unwind_incomplete_requests(engine);
 
 	/* Following the reset, we need to reload the CSB read/write pointers */
 	reset_csb_pointers(&engine->execlists);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+		  engine->name,
+		  rq ? rq->global_seqno : 0,
+		  intel_engine_get_seqno(engine),
+		  yesno(stalled));
+	if (!rq)
+		goto out_unlock;
 
 	/*
 	 * If the request was innocent, we leave the request in the ELSP
@@ -1829,8 +1799,9 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * and have to at least restore the RING register in the context
 	 * image back to the expected values to skip over the guilty request.
 	 */
-	if (!request || request->fence.error != -EIO)
-		return;
+	i915_reset_request(rq, stalled);
+	if (!stalled)
+		goto out_unlock;
 
 	/*
 	 * We want a simple context + ring to execute the breadcrumb update.
@@ -1840,25 +1811,23 @@ static void execlists_reset(struct intel_engine_cs *engine,
 	 * future request will be after userspace has had the opportunity
 	 * to recreate its own state.
 	 */
-	regs = request->hw_context->lrc_reg_state;
+	regs = rq->hw_context->lrc_reg_state;
 	if (engine->pinned_default_state) {
 		memcpy(regs, /* skip restoring the vanilla PPHWSP */
 		       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
 		       engine->context_size - PAGE_SIZE);
 	}
-	execlists_init_reg_state(regs,
-				 request->gem_context, engine, request->ring);
+	execlists_init_reg_state(regs, rq->gem_context, engine, rq->ring);
 
 	/* Move the RING_HEAD onto the breadcrumb, past the hanging batch */
-	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(request->ring->vma);
-
-	request->ring->head = intel_ring_wrap(request->ring, request->postfix);
-	regs[CTX_RING_HEAD + 1] = request->ring->head;
+	regs[CTX_RING_BUFFER_START + 1] = i915_ggtt_offset(rq->ring->vma);
 
-	intel_ring_update_space(request->ring);
+	rq->ring->head = intel_ring_wrap(rq->ring, rq->postfix);
+	regs[CTX_RING_HEAD + 1] = rq->ring->head;
+	intel_ring_update_space(rq->ring);
 
-	/* Reset WaIdleLiteRestore:bdw,skl as well */
-	unwind_wa_tail(request);
+out_unlock:
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
 }
 
 static void execlists_reset_finish(struct intel_engine_cs *engine)
@@ -1871,6 +1840,7 @@ static void execlists_reset_finish(struct intel_engine_cs *engine)
 	 * to sleep before we restart and reload a context.
 	 *
 	 */
+	GEM_BUG_ON(!reset_in_progress(execlists));
 	if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
 		execlists->tasklet.func(execlists->tasklet.data);
 
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index c81db81e4416..f68c7975006c 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -478,8 +478,6 @@ void intel_overlay_reset(struct drm_i915_private *dev_priv)
 	if (!overlay)
 		return;
 
-	intel_overlay_release_old_vid(overlay);
-
 	overlay->old_xscale = 0;
 	overlay->old_yscale = 0;
 	overlay->crtc = NULL;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 26b7274a2d43..662907e1a286 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -33,6 +33,7 @@
 
 #include "i915_drv.h"
 #include "i915_gem_render_state.h"
+#include "i915_reset.h"
 #include "i915_trace.h"
 #include "intel_drv.h"
 #include "intel_workarounds.h"
@@ -707,52 +708,80 @@ static int init_ring_common(struct intel_engine_cs *engine)
 	return ret;
 }
 
-static struct i915_request *reset_prepare(struct intel_engine_cs *engine)
+static void reset_prepare(struct intel_engine_cs *engine)
 {
 	intel_engine_stop_cs(engine);
-	return i915_gem_find_active_request(engine);
 }
 
-static void skip_request(struct i915_request *rq)
+static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 {
-	void *vaddr = rq->ring->vaddr;
+	struct i915_timeline *tl = &engine->timeline;
+	struct i915_request *pos, *rq;
+	unsigned long flags;
 	u32 head;
 
-	head = rq->infix;
-	if (rq->postfix < head) {
-		memset32(vaddr + head, MI_NOOP,
-			 (rq->ring->size - head) / sizeof(u32));
-		head = 0;
+	rq = NULL;
+	spin_lock_irqsave(&tl->lock, flags);
+	list_for_each_entry(pos, &tl->requests, link) {
+		if (!__i915_request_completed(pos, pos->global_seqno)) {
+			rq = pos;
+			break;
+		}
 	}
-	memset32(vaddr + head, MI_NOOP, (rq->postfix - head) / sizeof(u32));
-}
-
-static void reset_ring(struct intel_engine_cs *engine, struct i915_request *rq)
-{
-	GEM_TRACE("%s request global=%d, current=%d\n",
-		  engine->name, rq ? rq->global_seqno : 0,
-		  intel_engine_get_seqno(engine));
 
+	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
+		  engine->name,
+		  rq ? rq->global_seqno : 0,
+		  intel_engine_get_seqno(engine),
+		  yesno(stalled));
 	/*
-	 * Try to restore the logical GPU state to match the continuation
-	 * of the request queue. If we skip the context/PD restore, then
-	 * the next request may try to execute assuming that its context
-	 * is valid and loaded on the GPU and so may try to access invalid
-	 * memory, prompting repeated GPU hangs.
+	 * The guilty request will get skipped on a hung engine.
 	 *
-	 * If the request was guilty, we still restore the logical state
-	 * in case the next request requires it (e.g. the aliasing ppgtt),
-	 * but skip over the hung batch.
+	 * Users of client default contexts do not rely on logical
+	 * state preserved between batches so it is safe to execute
+	 * queued requests following the hang. Non default contexts
+	 * rely on preserved state, so skipping a batch loses the
+	 * evolution of the state and it needs to be considered corrupted.
+	 * Executing more queued batches on top of corrupted state is
+	 * risky. But we take the risk by trying to advance through
+	 * the queued requests in order to make the client behaviour
+	 * more predictable around resets, by not throwing away random
+	 * amount of batches it has prepared for execution. Sophisticated
+	 * clients can use gem_reset_stats_ioctl and dma fence status
+	 * (exported via sync_file info ioctl on explicit fences) to observe
+	 * when it loses the context state and should rebuild accordingly.
 	 *
-	 * If the request was innocent, we try to replay the request with
-	 * the restored context.
+	 * The context ban, and ultimately the client ban, mechanism are safety
+	 * valves if client submission ends up resulting in nothing more than
+	 * subsequent hangs.
 	 */
+
 	if (rq) {
-		/* If the rq hung, jump to its breadcrumb and skip the batch */
-		rq->ring->head = intel_ring_wrap(rq->ring, rq->head);
-		if (rq->fence.error == -EIO)
-			skip_request(rq);
+		/*
+		 * Try to restore the logical GPU state to match the
+		 * continuation of the request queue. If we skip the
+		 * context/PD restore, then the next request may try to execute
+		 * assuming that its context is valid and loaded on the GPU and
+		 * so may try to access invalid memory, prompting repeated GPU
+		 * hangs.
+		 *
+		 * If the request was guilty, we still restore the logical
+		 * state in case the next request requires it (e.g. the
+		 * aliasing ppgtt), but skip over the hung batch.
+		 *
+		 * If the request was innocent, we try to replay the request
+		 * with the restored context.
+		 */
+		i915_reset_request(rq, stalled);
+
+		GEM_BUG_ON(rq->ring != engine->buffer);
+		head = rq->head;
+	} else {
+		head = engine->buffer->tail;
 	}
+	engine->buffer->head = intel_ring_wrap(engine->buffer, head);
+
+	spin_unlock_irqrestore(&tl->lock, flags);
 }
 
 static void reset_finish(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c5975c67f74d..9ecae7de3b0a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -120,13 +120,8 @@ struct intel_instdone {
 struct intel_engine_hangcheck {
 	u64 acthd;
 	u32 seqno;
-	enum intel_engine_hangcheck_action action;
 	unsigned long action_timestamp;
-	int deadlock;
 	struct intel_instdone instdone;
-	struct i915_request *active_request;
-	bool stalled:1;
-	bool wedged:1;
 };
 
 struct intel_ring {
@@ -452,9 +447,8 @@ struct intel_engine_cs {
 	int		(*init_hw)(struct intel_engine_cs *engine);
 
 	struct {
-		struct i915_request *(*prepare)(struct intel_engine_cs *engine);
-		void (*reset)(struct intel_engine_cs *engine,
-			      struct i915_request *rq);
+		void (*prepare)(struct intel_engine_cs *engine);
+		void (*reset)(struct intel_engine_cs *engine, bool stalled);
 		void (*finish)(struct intel_engine_cs *engine);
 	} reset;
 
@@ -1026,6 +1020,13 @@ gen8_emit_ggtt_write(u32 *cs, u32 value, u32 gtt_offset)
 	return cs;
 }
 
+static inline void intel_engine_reset(struct intel_engine_cs *engine,
+				      bool stalled)
+{
+	if (engine->reset.reset)
+		engine->reset.reset(engine, stalled);
+}
+
 void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
 
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 12550b55c42f..67431355cd6e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -363,9 +363,7 @@ static int igt_global_reset(void *arg)
 	/* Check that we can issue a global GPU reset */
 
 	igt_global_reset_lock(i915);
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 
-	mutex_lock(&i915->drm.struct_mutex);
 	reset_count = i915_reset_count(&i915->gpu_error);
 
 	i915_reset(i915, ALL_ENGINES, NULL);
@@ -374,9 +372,7 @@ static int igt_global_reset(void *arg)
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
 	}
-	mutex_unlock(&i915->drm.struct_mutex);
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	igt_global_reset_unlock(i915);
 
 	if (i915_terminally_wedged(&i915->gpu_error))
@@ -399,9 +395,7 @@ static int igt_wedged_reset(void *arg)
 	i915_gem_set_wedged(i915);
 	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 	i915_reset(i915, ALL_ENGINES, NULL);
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
@@ -511,7 +505,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				break;
 			}
 
-			if (!wait_for_idle(engine)) {
+			if (!i915_reset_flush(i915)) {
 				struct drm_printer p =
 					drm_info_printer(i915->drm.dev);
 
@@ -903,20 +897,13 @@ static int igt_reset_engines(void *arg)
 	return 0;
 }
 
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
 {
-	struct i915_gpu_error *error = &rq->i915->gpu_error;
-	u32 reset_count = i915_reset_count(error);
-
-	error->stalled_mask = mask;
-
-	/* set_bit() must be after we have setup the backchannel (mask) */
-	smp_mb__before_atomic();
-	set_bit(I915_RESET_HANDOFF, &error->flags);
+	u32 count = i915_reset_count(&i915->gpu_error);
 
-	wake_up_all(&error->wait_queue);
+	i915_reset(i915, mask, NULL);
 
-	return reset_count;
+	return count;
 }
 
 static int igt_reset_wait(void *arg)
@@ -962,7 +949,7 @@ static int igt_reset_wait(void *arg)
 		goto out_rq;
 	}
 
-	reset_count = fake_hangcheck(rq, ALL_ENGINES);
+	reset_count = fake_hangcheck(i915, ALL_ENGINES);
 
 	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
 	if (timeout < 0) {
@@ -972,7 +959,6 @@ static int igt_reset_wait(void *arg)
 		goto out_rq;
 	}
 
-	GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
 	if (i915_reset_count(&i915->gpu_error) == reset_count) {
 		pr_err("No GPU reset recorded!\n");
 		err = -EINVAL;
@@ -1162,7 +1148,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
-	fake_hangcheck(rq, intel_engine_flag(rq->engine));
+	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
 
 	if (tsk) {
 		struct igt_wedge_me w;
@@ -1341,12 +1327,7 @@ static int igt_reset_queue(void *arg)
 				goto fini;
 			}
 
-			reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
-			i915_reset(i915, ENGINE_MASK(id), NULL);
-
-			GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
-					    &i915->gpu_error.flags));
+			reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
 
 			if (prev->fence.error != -EIO) {
 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
@@ -1565,6 +1546,7 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
 		       __func__, engine->name,
 		       rq->fence.seqno, hws_seqno(&h, rq));
+		i915_gem_set_wedged(i915);
 		err = -EIO;
 	}
 
@@ -1588,7 +1570,6 @@ static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
 static void force_reset(struct drm_i915_private *i915)
 {
 	i915_gem_set_wedged(i915);
-	set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
 	i915_reset(i915, 0, NULL);
 }
 
@@ -1618,6 +1599,26 @@ static int igt_atomic_reset(void *arg)
 	if (i915_terminally_wedged(&i915->gpu_error))
 		goto unlock;
 
+	if (intel_has_gpu_reset(i915)) {
+		const typeof(*phases) *p;
+
+		for (p = phases; p->name; p++) {
+			GEM_TRACE("intel_gpu_reset under %s\n", p->name);
+
+			p->critical_section_begin();
+			err = intel_gpu_reset(i915, ALL_ENGINES);
+			p->critical_section_end();
+
+			if (err) {
+				pr_err("intel_gpu_reset failed under %s\n",
+				       p->name);
+				goto out;
+			}
+		}
+
+		force_reset(i915);
+	}
+
 	if (intel_has_reset_engine(i915)) {
 		struct intel_engine_cs *engine;
 		enum intel_engine_id id;
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index a8cac56be835..b15c4f26c593 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -214,7 +214,6 @@ static int check_whitelist(struct i915_gem_context *ctx,
 
 static int do_device_reset(struct intel_engine_cs *engine)
 {
-	set_bit(I915_RESET_HANDOFF, &engine->i915->gpu_error.flags);
 	i915_reset(engine->i915, ENGINE_MASK(engine->id), "live_workarounds");
 	return 0;
 }
@@ -394,7 +393,6 @@ static int
 live_gpu_reset_gt_engine_workarounds(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
-	struct i915_gpu_error *error = &i915->gpu_error;
 	intel_wakeref_t wakeref;
 	struct wa_lists lists;
 	bool ok;
@@ -413,7 +411,6 @@ live_gpu_reset_gt_engine_workarounds(void *arg)
 	if (!ok)
 		goto out;
 
-	set_bit(I915_RESET_HANDOFF, &error->flags);
 	i915_reset(i915, ALL_ENGINES, "live_workarounds");
 
 	ok = verify_gt_engine_wa(i915, &lists, "after reset");
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 3cda66292e76..888c6978bc54 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -58,8 +58,8 @@ static void mock_device_release(struct drm_device *dev)
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 
-	cancel_delayed_work_sync(&i915->gt.retire_work);
-	cancel_delayed_work_sync(&i915->gt.idle_work);
+	drain_delayed_work(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.idle_work);
 	i915_gem_drain_workqueue(i915);
 
 	mutex_lock(&i915->drm.struct_mutex);
-- 
2.20.1
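
[Aside, not part of the patch: the new reset_ring() comment above points
clients at the reset-stats ioctl to notice when their context state has
been lost. A userspace sketch of that query follows, assuming libdrm's
drmIoctl() helper and the i915_drm.h uapi header; error handling trimmed.]

#include <string.h>
#include <stdio.h>
#include <xf86drm.h>
#include <i915_drm.h>

/*
 * Query how often the given context (0 == the fd's default context) has
 * been found guilty of a hang (batch_active) or been an innocent victim
 * (batch_pending). An increase in batch_active since the last query means
 * the logical context state was lost and must be rebuilt. reset_count is
 * the global reset tally and is only reported to privileged processes.
 */
static int query_reset_stats(int fd, unsigned int ctx_id)
{
	struct drm_i915_reset_stats stats;

	memset(&stats, 0, sizeof(stats));
	stats.ctx_id = ctx_id;

	if (drmIoctl(fd, DRM_IOCTL_I915_GET_RESET_STATS, &stats))
		return -1;

	printf("resets=%u guilty=%u innocent=%u\n",
	       stats.reset_count, stats.batch_active, stats.batch_pending);
	return 0;
}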

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 05/38] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (3 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 04/38] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:29   ` Mika Kuoppala
  2019-01-18 14:00 ` [PATCH 06/38] drm/i915: Issue engine resets onto idle engines Chris Wilson
                   ` (34 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx; +Cc: Mika Kuoppala

Trim the struct_mutex hold time and move the call to i915_gem_set_wedged()
outside of it, as a reminder that it must be callable without struct_mutex
held.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
Cc: Mika Kuoppala <mika.kuoppala@intel.com>
---
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 67431355cd6e..28144fd72550 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -389,16 +389,16 @@ static int igt_wedged_reset(void *arg)
 	/* Check that we can recover a wedged device with a GPU reset */
 
 	igt_global_reset_lock(i915);
-	mutex_lock(&i915->drm.struct_mutex);
 	wakeref = intel_runtime_pm_get(i915);
 
 	i915_gem_set_wedged(i915);
-	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 
+	mutex_lock(&i915->drm.struct_mutex);
+	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
 	i915_reset(i915, ALL_ENGINES, NULL);
+	mutex_unlock(&i915->drm.struct_mutex);
 
 	intel_runtime_pm_put(i915, wakeref);
-	mutex_unlock(&i915->drm.struct_mutex);
 	igt_global_reset_unlock(i915);
 
 	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 06/38] drm/i915: Issue engine resets onto idle engines
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (4 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 05/38] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA Chris Wilson
                   ` (33 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Always perform the requested reset, even if we believe the engine is
idle. Presumably there was a reason the caller wanted the reset, and in
the near future we will lose the easy means of tracking whether the
engine is idle.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_reset.c             |  4 ----
 .../gpu/drm/i915/selftests/intel_hangcheck.c  | 22 +++++--------------
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 064fc6da1512..d44b095e2860 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -1063,10 +1063,6 @@ int i915_reset_engine(struct intel_engine_cs *engine, const char *msg)
 	GEM_TRACE("%s flags=%lx\n", engine->name, error->flags);
 	GEM_BUG_ON(!test_bit(I915_RESET_ENGINE + engine->id, &error->flags));
 
-	if (i915_seqno_passed(intel_engine_get_seqno(engine),
-			      intel_engine_last_submit(engine)))
-		return 0;
-
 	reset_prepare_engine(engine);
 
 	if (msg)
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 28144fd72550..9d0cc9d63a1e 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -449,8 +449,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
-			u32 seqno = intel_engine_get_seqno(engine);
-
 			if (active) {
 				struct i915_request *rq;
 
@@ -479,8 +477,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 					break;
 				}
 
-				GEM_BUG_ON(!rq->global_seqno);
-				seqno = rq->global_seqno - 1;
 				i915_request_put(rq);
 			}
 
@@ -496,11 +492,10 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
 				break;
 			}
 
-			reset_engine_count += active;
 			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
-			    reset_engine_count) {
-				pr_err("%s engine reset %srecorded!\n",
-				       engine->name, active ? "not " : "");
+			    ++reset_engine_count) {
+				pr_err("%s engine reset not recorded!\n",
+				       engine->name);
 				err = -EINVAL;
 				break;
 			}
@@ -728,7 +723,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 
 		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
 		do {
-			u32 seqno = intel_engine_get_seqno(engine);
 			struct i915_request *rq = NULL;
 
 			if (flags & TEST_ACTIVE) {
@@ -756,9 +750,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 					err = -EIO;
 					break;
 				}
-
-				GEM_BUG_ON(!rq->global_seqno);
-				seqno = rq->global_seqno - 1;
 			}
 
 			err = i915_reset_engine(engine, NULL);
@@ -795,10 +786,9 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
 
 		reported = i915_reset_engine_count(&i915->gpu_error, engine);
 		reported -= threads[engine->id].resets;
-		if (reported != (flags & TEST_ACTIVE ? count : 0)) {
-			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
-			       engine->name, test_name, count, reported,
-			       (flags & TEST_ACTIVE ? count : 0));
+		if (reported != count) {
+			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+			       engine->name, test_name, count, reported);
 			if (!err)
 				err = -EINVAL;
 		}
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (5 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 06/38] drm/i915: Issue engine resets onto idle engines Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 16:03   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex Chris Wilson
                   ` (32 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Our goal is to remove struct_mutex and replace it with fine-grained
locking. One of the thorny issues is our eviction logic for reclaiming
space for an execbuffer (or a GTT mmapping, among a few other examples).
While eviction itself is easy to move under a per-VM mutex, performing
the activity tracking is less agreeable. One solution is to drop MRU
tracking altogether and do a simple coarse active/inactive evaluation
during eviction, with a loose temporal ordering of last
insertion/evaluation. That keeps all the locking constrained to when we
are manipulating the VM itself, neatly avoiding the tricky handling of
possible recursive locking during execbuf and elsewhere.

Note that discarding the MRU is unlikely to hurt our efficiency at
reclaiming VM space (where we think an LRU model is best), as our
current strategy is to use random idle replacement first before doing
a search, and over time the use of softpinned 48b per-ppGTT is growing
(thereby eliminating the need to perform eviction searches at all, in
theory at least).
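
As a purely illustrative, self-contained sketch of the single-list scan
described above (struct vma, the active flag and scan_for_eviction() below
are stand-ins for this illustration, not the driver's types):

#include <stdbool.h>
#include <stdio.h>

struct vma {
	struct vma *prev, *next;	/* the list head is a sentinel node */
	bool active;
	int id;
};

static void list_move_tail(struct vma *v, struct vma *head)
{
	v->prev->next = v->next;	/* unlink */
	v->next->prev = v->prev;
	v->prev = head->prev;		/* relink just before the head */
	v->next = head;
	head->prev->next = v;
	head->prev = v;
}

/*
 * One pass over the bound list in least-recently-scanned order: inactive
 * vma are immediate eviction candidates, while active vma are rotated to
 * the tail. Remembering the first active vma seen lets us notice when we
 * have completed one full cycle and should give up.
 */
static struct vma *scan_for_eviction(struct vma *head)
{
	struct vma *v, *next, *first_active = NULL;

	for (v = head->next; v != head; v = next) {
		next = v->next;

		if (v->active) {
			if (v == first_active)
				break;		/* scanned everything once */
			if (!first_active)
				first_active = v;
			list_move_tail(v, head);
			continue;
		}

		return v;			/* inactive: cheap to reap */
	}

	return NULL;
}

int main(void)
{
	struct vma head, a = { .active = true, .id = 0 }, b = { .active = false, .id = 1 };
	struct vma *victim;

	head.next = &a; a.prev = &head;		/* head <-> a <-> b <-> head */
	a.next = &b;    b.prev = &a;
	b.next = &head; head.prev = &b;

	victim = scan_for_eviction(&head);
	printf("victim: %d\n", victim ? victim->id : -1);	/* prints "victim: 1" */
	return 0;
}

The real loop below does the same walk over vm->bound_list, and after one
full cycle falls back to scanning the active vma as well (unless
PIN_NONBLOCK was requested).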

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c               | 10 +--
 drivers/gpu/drm/i915/i915_gem_evict.c         | 71 ++++++++++++-------
 drivers/gpu/drm/i915/i915_gem_gtt.c           | 15 ++--
 drivers/gpu/drm/i915/i915_gem_gtt.h           | 26 +------
 drivers/gpu/drm/i915/i915_gem_shrinker.c      |  8 ++-
 drivers/gpu/drm/i915/i915_gem_stolen.c        |  3 +-
 drivers/gpu/drm/i915/i915_gpu_error.c         | 37 +++++-----
 drivers/gpu/drm/i915/i915_vma.c               |  9 +--
 .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  2 +-
 10 files changed, 84 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d20b42386c3c..f45186ddb236 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -253,10 +253,7 @@ i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 
 	pinned = ggtt->vm.reserved;
 	mutex_lock(&dev->struct_mutex);
-	list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
-		if (i915_vma_is_pinned(vma))
-			pinned += vma->node.size;
-	list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
+	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
 		if (i915_vma_is_pinned(vma))
 			pinned += vma->node.size;
 	mutex_unlock(&dev->struct_mutex);
@@ -1539,13 +1536,10 @@ static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
 
 	for_each_ggtt_vma(vma, obj) {
-		if (i915_vma_is_active(vma))
-			continue;
-
 		if (!drm_mm_node_allocated(&vma->node))
 			continue;
 
-		list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
+		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
 	}
 
 	i915 = to_i915(obj->base.dev);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index f6855401f247..5cfe4b75e7d6 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -126,14 +126,10 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	struct drm_i915_private *dev_priv = vm->i915;
 	struct drm_mm_scan scan;
 	struct list_head eviction_list;
-	struct list_head *phases[] = {
-		&vm->inactive_list,
-		&vm->active_list,
-		NULL,
-	}, **phase;
 	struct i915_vma *vma, *next;
 	struct drm_mm_node *node;
 	enum drm_mm_insert_mode mode;
+	struct i915_vma *active;
 	int ret;
 
 	lockdep_assert_held(&vm->i915->drm.struct_mutex);
@@ -169,17 +165,46 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	 */
 	if (!(flags & PIN_NONBLOCK))
 		i915_retire_requests(dev_priv);
-	else
-		phases[1] = NULL;
 
 search_again:
+	active = NULL;
 	INIT_LIST_HEAD(&eviction_list);
-	phase = phases;
-	do {
-		list_for_each_entry(vma, *phase, vm_link)
-			if (mark_free(&scan, vma, flags, &eviction_list))
-				goto found;
-	} while (*++phase);
+	list_for_each_entry_safe(vma, next, &vm->bound_list, vm_link) {
+		/*
+		 * We keep this list in a rough least-recently scanned order
+		 * of active elements (inactive elements are cheap to reap).
+		 * New entries are added to the end, and we move anything we
+		 * scan to the end. The assumption is that the working set
+		 * of applications is either steady state (and thanks to the
+		 * userspace bo cache it almost always is) or volatile and
+		 * frequently replaced after a frame, which are self-evicting!
+		 * Given that assumption, the MRU order of the scan list is
+		 * fairly static, and keeping it in least-recently scan order
+		 * is suitable.
+		 *
+		 * To notice when we complete one full cycle, we record the
+		 * first active element seen, before moving it to the tail.
+		 */
+		if (i915_vma_is_active(vma)) {
+			if (vma == active) {
+				if (flags & PIN_NONBLOCK)
+					break;
+
+				active = ERR_PTR(-EAGAIN);
+			}
+
+			if (active != ERR_PTR(-EAGAIN)) {
+				if (!active)
+					active = vma;
+
+				list_move_tail(&vma->vm_link, &vm->bound_list);
+				continue;
+			}
+		}
+
+		if (mark_free(&scan, vma, flags, &eviction_list))
+			goto found;
+	}
 
 	/* Nothing found, clean up and bail out! */
 	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
@@ -388,11 +413,6 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
  */
 int i915_gem_evict_vm(struct i915_address_space *vm)
 {
-	struct list_head *phases[] = {
-		&vm->inactive_list,
-		&vm->active_list,
-		NULL
-	}, **phase;
 	struct list_head eviction_list;
 	struct i915_vma *vma, *next;
 	int ret;
@@ -412,16 +432,13 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
 	}
 
 	INIT_LIST_HEAD(&eviction_list);
-	phase = phases;
-	do {
-		list_for_each_entry(vma, *phase, vm_link) {
-			if (i915_vma_is_pinned(vma))
-				continue;
+	list_for_each_entry(vma, &vm->bound_list, vm_link) {
+		if (i915_vma_is_pinned(vma))
+			continue;
 
-			__i915_vma_pin(vma);
-			list_add(&vma->evict_link, &eviction_list);
-		}
-	} while (*++phase);
+		__i915_vma_pin(vma);
+		list_add(&vma->evict_link, &eviction_list);
+	}
 
 	ret = 0;
 	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 9081e3bc5a59..2ad9070a54c1 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -491,9 +491,8 @@ static void i915_address_space_init(struct i915_address_space *vm, int subclass)
 
 	stash_init(&vm->free_pages);
 
-	INIT_LIST_HEAD(&vm->active_list);
-	INIT_LIST_HEAD(&vm->inactive_list);
 	INIT_LIST_HEAD(&vm->unbound_list);
+	INIT_LIST_HEAD(&vm->bound_list);
 }
 
 static void i915_address_space_fini(struct i915_address_space *vm)
@@ -2111,8 +2110,7 @@ void i915_ppgtt_close(struct i915_address_space *vm)
 static void ppgtt_destroy_vma(struct i915_address_space *vm)
 {
 	struct list_head *phases[] = {
-		&vm->active_list,
-		&vm->inactive_list,
+		&vm->bound_list,
 		&vm->unbound_list,
 		NULL,
 	}, **phase;
@@ -2135,8 +2133,7 @@ void i915_ppgtt_release(struct kref *kref)
 
 	ppgtt_destroy_vma(&ppgtt->vm);
 
-	GEM_BUG_ON(!list_empty(&ppgtt->vm.active_list));
-	GEM_BUG_ON(!list_empty(&ppgtt->vm.inactive_list));
+	GEM_BUG_ON(!list_empty(&ppgtt->vm.bound_list));
 	GEM_BUG_ON(!list_empty(&ppgtt->vm.unbound_list));
 
 	ppgtt->vm.cleanup(&ppgtt->vm);
@@ -2801,8 +2798,7 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	i915_gem_fini_aliasing_ppgtt(dev_priv);
 
-	GEM_BUG_ON(!list_empty(&ggtt->vm.active_list));
-	list_for_each_entry_safe(vma, vn, &ggtt->vm.inactive_list, vm_link)
+	list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link)
 		WARN_ON(i915_vma_unbind(vma));
 
 	if (drm_mm_node_allocated(&ggtt->error_capture))
@@ -3514,8 +3510,7 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
 	ggtt->vm.closed = true; /* skip rewriting PTE on VMA unbind */
 
 	/* clflush objects bound into the GGTT and rebind them. */
-	GEM_BUG_ON(!list_empty(&ggtt->vm.active_list));
-	list_for_each_entry_safe(vma, vn, &ggtt->vm.inactive_list, vm_link) {
+	list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
 		if (!(vma->flags & I915_VMA_GLOBAL_BIND))
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index a0039ea97cdc..bd679c8c56dd 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -299,32 +299,12 @@ struct i915_address_space {
 	struct i915_page_directory_pointer *scratch_pdp; /* GEN8+ & 48b PPGTT */
 
 	/**
-	 * List of objects currently involved in rendering.
-	 *
-	 * Includes buffers having the contents of their GPU caches
-	 * flushed, not necessarily primitives. last_read_req
-	 * represents when the rendering involved will be completed.
-	 *
-	 * A reference is held on the buffer while on this list.
+	 * List of vma currently bound.
 	 */
-	struct list_head active_list;
+	struct list_head bound_list;
 
 	/**
-	 * LRU list of objects which are not in the ringbuffer and
-	 * are ready to unbind, but are still in the GTT.
-	 *
-	 * last_read_req is NULL while an object is in this list.
-	 *
-	 * A reference is not held on the buffer while on this list,
-	 * as merely being GTT-bound shouldn't prevent its being
-	 * freed, and we'll pull it off the list in the free path.
-	 */
-	struct list_head inactive_list;
-
-	/**
-	 * List of vma that have been unbound.
-	 *
-	 * A reference is not held on the buffer while on this list.
+	 * List of vma that are not unbound.
 	 */
 	struct list_head unbound_list;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index 8ceecb026910..a76d6c95c824 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -462,9 +462,13 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
 
 	/* We also want to clear any cached iomaps as they wrap vmap */
 	list_for_each_entry_safe(vma, next,
-				 &i915->ggtt.vm.inactive_list, vm_link) {
+				 &i915->ggtt.vm.bound_list, vm_link) {
 		unsigned long count = vma->node.size >> PAGE_SHIFT;
-		if (vma->iomap && i915_vma_unbind(vma) == 0)
+
+		if (!vma->iomap || i915_vma_is_active(vma))
+			continue;
+
+		if (i915_vma_unbind(vma) == 0)
 			freed_pages += count;
 	}
 
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index 9df615eea2d8..a9e365789686 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -701,7 +701,8 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv
 	vma->pages = obj->mm.pages;
 	vma->flags |= I915_VMA_GLOBAL_BIND;
 	__i915_vma_set_map_and_fenceable(vma);
-	list_move_tail(&vma->vm_link, &ggtt->vm.inactive_list);
+
+	list_move_tail(&vma->vm_link, &ggtt->vm.bound_list);
 
 	spin_lock(&dev_priv->mm.obj_lock);
 	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index afccffa9f3f9..6f2fcad0a6ee 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -1121,7 +1121,7 @@ static void capture_bo(struct drm_i915_error_buffer *err,
 
 static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 			    int count, struct list_head *head,
-			    bool pinned_only)
+			    bool active_only, bool pinned_only)
 {
 	struct i915_vma *vma;
 	int i = 0;
@@ -1130,6 +1130,9 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 		if (!vma->obj)
 			continue;
 
+		if (active_only && !i915_vma_is_active(vma))
+			continue;
+
 		if (pinned_only && !i915_vma_is_pinned(vma))
 			continue;
 
@@ -1599,14 +1602,16 @@ static void gem_capture_vm(struct i915_gpu_state *error,
 	int count;
 
 	count = 0;
-	list_for_each_entry(vma, &vm->active_list, vm_link)
-		count++;
+	list_for_each_entry(vma, &vm->bound_list, vm_link)
+		if (i915_vma_is_active(vma))
+			count++;
 
 	active_bo = NULL;
 	if (count)
 		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
 	if (active_bo)
-		count = capture_error_bo(active_bo, count, &vm->active_list, false);
+		count = capture_error_bo(active_bo, count, &vm->bound_list,
+					 true, false);
 	else
 		count = 0;
 
@@ -1644,28 +1649,20 @@ static void capture_pinned_buffers(struct i915_gpu_state *error)
 	struct i915_address_space *vm = &error->i915->ggtt.vm;
 	struct drm_i915_error_buffer *bo;
 	struct i915_vma *vma;
-	int count_inactive, count_active;
-
-	count_inactive = 0;
-	list_for_each_entry(vma, &vm->inactive_list, vm_link)
-		count_inactive++;
+	int count;
 
-	count_active = 0;
-	list_for_each_entry(vma, &vm->active_list, vm_link)
-		count_active++;
+	count = 0;
+	list_for_each_entry(vma, &vm->bound_list, vm_link)
+		count++;
 
 	bo = NULL;
-	if (count_inactive + count_active)
-		bo = kcalloc(count_inactive + count_active,
-			     sizeof(*bo), GFP_ATOMIC);
+	if (count)
+		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
 	if (!bo)
 		return;
 
-	count_inactive = capture_error_bo(bo, count_inactive,
-					  &vm->active_list, true);
-	count_active = capture_error_bo(bo + count_inactive, count_active,
-					&vm->inactive_list, true);
-	error->pinned_bo_count = count_inactive + count_active;
+	error->pinned_bo_count =
+		capture_error_bo(bo, count, &vm->bound_list, false, true);
 	error->pinned_bo = bo;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 5b4d78cdb4ca..7de28baffb8f 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -79,9 +79,6 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
 	if (--vma->active_count)
 		return;
 
-	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
-	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
-
 	GEM_BUG_ON(!i915_gem_object_is_active(obj));
 	if (--obj->active_count)
 		return;
@@ -659,7 +656,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
 	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, cache_level));
 
-	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
+	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
 
 	if (vma->obj) {
 		struct drm_i915_gem_object *obj = vma->obj;
@@ -1003,10 +1000,8 @@ int i915_vma_move_to_active(struct i915_vma *vma,
 	 * add the active reference first and queue for it to be dropped
 	 * *last*.
 	 */
-	if (!i915_gem_active_isset(active) && !vma->active_count++) {
-		list_move_tail(&vma->vm_link, &vma->vm->active_list);
+	if (!i915_gem_active_isset(active) && !vma->active_count++)
 		obj->active_count++;
-	}
 	i915_gem_active_set(active, rq);
 	GEM_BUG_ON(!i915_vma_is_active(vma));
 	GEM_BUG_ON(!obj->active_count);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 543d618c152b..9e6b9304f2bd 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -90,7 +90,7 @@ static int populate_ggtt(struct drm_i915_private *i915)
 		goto cleanup;
 	}
 
-	if (list_empty(&i915->ggtt.vm.inactive_list)) {
+	if (list_empty(&i915->ggtt.vm.bound_list)) {
 		pr_err("No objects on the GGTT inactive list!\n");
 		return -EINVAL;
 	}
@@ -111,7 +111,7 @@ static void unpin_ggtt(struct drm_i915_private *i915)
 {
 	struct i915_vma *vma;
 
-	list_for_each_entry(vma, &i915->ggtt.vm.inactive_list, vm_link)
+	list_for_each_entry(vma, &i915->ggtt.vm.bound_list, vm_link)
 		i915_vma_unpin(vma);
 }
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index fea8ab14e79d..852b06cb50a0 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1237,7 +1237,7 @@ static void track_vma_bind(struct i915_vma *vma)
 	__i915_gem_object_pin_pages(obj);
 
 	vma->pages = obj->mm.pages;
-	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
+	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
 }
 
 static int exercise_mock(struct drm_i915_private *i915,
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex.
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (6 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 16:04   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 09/38] drm/i915: Move vma lookup to its own lock Chris Wilson
                   ` (31 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

A starting point to counter the pervasive struct_mutex. For the goal of
avoiding (or at least not blocking under!) global locks during user
request submission, a simple but important step is being able to manage
each client's GTT separately. To that end, we want to stop using
struct_mutex as the guard for all things GTT/VM and switch instead to a
specific mutex inside i915_address_space.
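
A minimal userspace analogue of the rule this introduces, namely that
every touch of a per-VM list happens under that VM's own mutex rather
than a device-global lock (pthread_mutex_t and the struct shapes below
are stand-ins for illustration, not the driver's types):

#include <pthread.h>

struct vma {
	struct vma *next;
};

struct vm {
	pthread_mutex_t mutex;	/* plays the role of i915_address_space.mutex */
	struct vma *bound_list;	/* singly-linked for brevity */
};

/* Mutating the per-VM list only ever takes the per-VM lock. */
static void vm_bind_vma(struct vm *vm, struct vma *vma)
{
	pthread_mutex_lock(&vm->mutex);
	vma->next = vm->bound_list;
	vm->bound_list = vma;
	pthread_mutex_unlock(&vm->mutex);
}

/*
 * Consumers detach an entry under the lock and only then do the slow work
 * on it, mirroring how the shrinker/evict paths below drop vm.mutex around
 * i915_vma_unbind(), which may sleep.
 */
static struct vma *vm_take_first_vma(struct vm *vm)
{
	struct vma *vma;

	pthread_mutex_lock(&vm->mutex);
	vma = vm->bound_list;
	if (vma)
		vm->bound_list = vma->next;
	pthread_mutex_unlock(&vm->mutex);

	return vma;
}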

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c                 | 14 ++++++++------
 drivers/gpu/drm/i915/i915_gem_evict.c           |  2 ++
 drivers/gpu/drm/i915/i915_gem_gtt.c             | 15 +++++++++++++--
 drivers/gpu/drm/i915/i915_gem_shrinker.c        |  4 ++++
 drivers/gpu/drm/i915/i915_gem_stolen.c          |  2 ++
 drivers/gpu/drm/i915/i915_vma.c                 | 11 +++++++++++
 drivers/gpu/drm/i915/selftests/i915_gem_evict.c |  3 +++
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c   |  3 +++
 8 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f45186ddb236..538fa5404603 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -245,18 +245,19 @@ int
 i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file)
 {
-	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct i915_ggtt *ggtt = &dev_priv->ggtt;
+	struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
 	struct drm_i915_gem_get_aperture *args = data;
 	struct i915_vma *vma;
 	u64 pinned;
 
+	mutex_lock(&ggtt->vm.mutex);
+
 	pinned = ggtt->vm.reserved;
-	mutex_lock(&dev->struct_mutex);
 	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
 		if (i915_vma_is_pinned(vma))
 			pinned += vma->node.size;
-	mutex_unlock(&dev->struct_mutex);
+
+	mutex_unlock(&ggtt->vm.mutex);
 
 	args->aper_size = ggtt->vm.total;
 	args->aper_available_size = args->aper_size - pinned;
@@ -1529,20 +1530,21 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
 {
-	struct drm_i915_private *i915;
+	struct drm_i915_private *i915 = to_i915(obj->base.dev);
 	struct list_head *list;
 	struct i915_vma *vma;
 
 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
 
+	mutex_lock(&i915->ggtt.vm.mutex);
 	for_each_ggtt_vma(vma, obj) {
 		if (!drm_mm_node_allocated(&vma->node))
 			continue;
 
 		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
 	}
+	mutex_unlock(&i915->ggtt.vm.mutex);
 
-	i915 = to_i915(obj->base.dev);
 	spin_lock(&i915->mm.obj_lock);
 	list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
 	list_move_tail(&obj->mm.link, list);
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index 5cfe4b75e7d6..dc137701acb8 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -432,6 +432,7 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
 	}
 
 	INIT_LIST_HEAD(&eviction_list);
+	mutex_lock(&vm->mutex);
 	list_for_each_entry(vma, &vm->bound_list, vm_link) {
 		if (i915_vma_is_pinned(vma))
 			continue;
@@ -439,6 +440,7 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
 		__i915_vma_pin(vma);
 		list_add(&vma->evict_link, &eviction_list);
 	}
+	mutex_unlock(&vm->mutex);
 
 	ret = 0;
 	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 2ad9070a54c1..49b00996a15e 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -1931,7 +1931,10 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
 	vma->ggtt_view.type = I915_GGTT_VIEW_ROTATED; /* prevent fencing */
 
 	INIT_LIST_HEAD(&vma->obj_link);
+
+	mutex_lock(&vma->vm->mutex);
 	list_add(&vma->vm_link, &vma->vm->unbound_list);
+	mutex_unlock(&vma->vm->mutex);
 
 	return vma;
 }
@@ -3504,9 +3507,10 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
 
 	i915_check_and_clear_faults(dev_priv);
 
+	mutex_lock(&ggtt->vm.mutex);
+
 	/* First fill our portion of the GTT with scratch pages */
 	ggtt->vm.clear_range(&ggtt->vm, 0, ggtt->vm.total);
-
 	ggtt->vm.closed = true; /* skip rewriting PTE on VMA unbind */
 
 	/* clflush objects bound into the GGTT and rebind them. */
@@ -3516,19 +3520,26 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
 		if (!(vma->flags & I915_VMA_GLOBAL_BIND))
 			continue;
 
+		mutex_unlock(&ggtt->vm.mutex);
+
 		if (!i915_vma_unbind(vma))
-			continue;
+			goto lock;
 
 		WARN_ON(i915_vma_bind(vma,
 				      obj ? obj->cache_level : 0,
 				      PIN_UPDATE));
 		if (obj)
 			WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
+
+lock:
+		mutex_lock(&ggtt->vm.mutex);
 	}
 
 	ggtt->vm.closed = false;
 	i915_ggtt_invalidate(dev_priv);
 
+	mutex_unlock(&ggtt->vm.mutex);
+
 	if (INTEL_GEN(dev_priv) >= 8) {
 		struct intel_ppat *ppat = &dev_priv->ppat;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
index a76d6c95c824..6da795c7e62e 100644
--- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
@@ -461,6 +461,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
 					       I915_SHRINK_VMAPS);
 
 	/* We also want to clear any cached iomaps as they wrap vmap */
+	mutex_lock(&i915->ggtt.vm.mutex);
 	list_for_each_entry_safe(vma, next,
 				 &i915->ggtt.vm.bound_list, vm_link) {
 		unsigned long count = vma->node.size >> PAGE_SHIFT;
@@ -468,9 +469,12 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
 		if (!vma->iomap || i915_vma_is_active(vma))
 			continue;
 
+		mutex_unlock(&i915->ggtt.vm.mutex);
 		if (i915_vma_unbind(vma) == 0)
 			freed_pages += count;
+		mutex_lock(&i915->ggtt.vm.mutex);
 	}
+	mutex_unlock(&i915->ggtt.vm.mutex);
 
 out:
 	shrinker_unlock(i915, unlock);
diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
index a9e365789686..74a9661479ca 100644
--- a/drivers/gpu/drm/i915/i915_gem_stolen.c
+++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
@@ -702,7 +702,9 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv
 	vma->flags |= I915_VMA_GLOBAL_BIND;
 	__i915_vma_set_map_and_fenceable(vma);
 
+	mutex_lock(&ggtt->vm.mutex);
 	list_move_tail(&vma->vm_link, &ggtt->vm.bound_list);
+	mutex_unlock(&ggtt->vm.mutex);
 
 	spin_lock(&dev_priv->mm.obj_lock);
 	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index 7de28baffb8f..dcbd0d345c72 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -213,7 +213,10 @@ vma_create(struct drm_i915_gem_object *obj,
 	}
 	rb_link_node(&vma->obj_node, rb, p);
 	rb_insert_color(&vma->obj_node, &obj->vma_tree);
+
+	mutex_lock(&vm->mutex);
 	list_add(&vma->vm_link, &vm->unbound_list);
+	mutex_unlock(&vm->mutex);
 
 	return vma;
 
@@ -656,7 +659,9 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
 	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
 	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, cache_level));
 
+	mutex_lock(&vma->vm->mutex);
 	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
+	mutex_unlock(&vma->vm->mutex);
 
 	if (vma->obj) {
 		struct drm_i915_gem_object *obj = vma->obj;
@@ -689,8 +694,10 @@ i915_vma_remove(struct i915_vma *vma)
 
 	vma->ops->clear_pages(vma);
 
+	mutex_lock(&vma->vm->mutex);
 	drm_mm_remove_node(&vma->node);
 	list_move_tail(&vma->vm_link, &vma->vm->unbound_list);
+	mutex_unlock(&vma->vm->mutex);
 
 	/*
 	 * Since the unbound list is global, only move to that list if
@@ -802,7 +809,11 @@ static void __i915_vma_destroy(struct i915_vma *vma)
 	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
 
 	list_del(&vma->obj_link);
+
+	mutex_lock(&vma->vm->mutex);
 	list_del(&vma->vm_link);
+	mutex_unlock(&vma->vm->mutex);
+
 	if (vma->obj)
 		rb_erase(&vma->obj_node, &vma->obj->vma_tree);
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index 9e6b9304f2bd..c8deb961a020 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -109,10 +109,13 @@ static int populate_ggtt(struct drm_i915_private *i915)
 
 static void unpin_ggtt(struct drm_i915_private *i915)
 {
+	struct i915_ggtt *ggtt = &i915->ggtt;
 	struct i915_vma *vma;
 
+	mutex_lock(&ggtt->vm.mutex);
 	list_for_each_entry(vma, &i915->ggtt.vm.bound_list, vm_link)
 		i915_vma_unpin(vma);
+	mutex_unlock(&ggtt->vm.mutex);
 }
 
 static void cleanup_objects(struct drm_i915_private *i915)
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 852b06cb50a0..35eb40e5de91 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1237,7 +1237,10 @@ static void track_vma_bind(struct i915_vma *vma)
 	__i915_gem_object_pin_pages(obj);
 
 	vma->pages = obj->mm.pages;
+
+	mutex_lock(&vma->vm->mutex);
 	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
+	mutex_unlock(&vma->vm->mutex);
 }
 
 static int exercise_mock(struct drm_i915_private *i915,
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 09/38] drm/i915: Move vma lookup to its own lock
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (7 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 16:14   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context Chris Wilson
                   ` (30 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Remove the struct_mutex requirement for looking up the vma for an
object.
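
As a rough illustration of the locking scheme introduced here, below is a
self-contained userspace sketch (a pthread mutex stands in for the new
obj->vma.lock; the struct and function names are invented for the example,
not taken from the driver). Lookup runs under the per-object lock,
allocation happens outside it, and the insert re-checks the list so that a
racing creator's instance wins:

/*
 * Sketch only: search under the per-object lock, allocate outside it,
 * then re-check on insert and keep the older entry if another thread
 * raced us.
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct obj_vma {
        int key;                /* stand-in for the (vm, view) identity */
        struct obj_vma *next;
};

static pthread_mutex_t vma_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj_vma *vma_list;

static struct obj_vma *lookup_locked(int key)
{
        struct obj_vma *v;

        for (v = vma_list; v; v = v->next)
                if (v->key == key)
                        return v;
        return NULL;
}

static struct obj_vma *vma_instance(int key)
{
        struct obj_vma *v, *pos;

        pthread_mutex_lock(&vma_lock);
        v = lookup_locked(key);
        pthread_mutex_unlock(&vma_lock);
        if (v)
                return v;

        /* Allocate outside the lock; in the driver this may sleep. */
        v = malloc(sizeof(*v));
        if (!v)
                return NULL;
        v->key = key;

        pthread_mutex_lock(&vma_lock);
        pos = lookup_locked(key);
        if (pos) {
                /* Someone beat us to it: dispose of ours, return theirs. */
                pthread_mutex_unlock(&vma_lock);
                free(v);
                return pos;
        }
        v->next = vma_list;
        vma_list = v;
        pthread_mutex_unlock(&vma_lock);
        return v;
}

int main(void)
{
        struct obj_vma *a = vma_instance(1);
        struct obj_vma *b = vma_instance(1);

        printf("same instance: %s\n", a == b ? "yes" : "no");
        return 0;
}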

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c       |  6 +--
 drivers/gpu/drm/i915/i915_gem.c           | 33 +++++++-----
 drivers/gpu/drm/i915/i915_gem_object.h    | 45 +++++++++-------
 drivers/gpu/drm/i915/i915_vma.c           | 66 ++++++++++++++++-------
 drivers/gpu/drm/i915/i915_vma.h           |  2 +-
 drivers/gpu/drm/i915/selftests/i915_vma.c |  4 +-
 6 files changed, 98 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3ec369980d40..2a6e4044f25b 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -159,14 +159,14 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 		   obj->mm.madv == I915_MADV_DONTNEED ? " purgeable" : "");
 	if (obj->base.name)
 		seq_printf(m, " (name: %d)", obj->base.name);
-	list_for_each_entry(vma, &obj->vma_list, obj_link) {
+	list_for_each_entry(vma, &obj->vma.list, obj_link) {
 		if (i915_vma_is_pinned(vma))
 			pin_count++;
 	}
 	seq_printf(m, " (pinned x %d)", pin_count);
 	if (obj->pin_global)
 		seq_printf(m, " (global)");
-	list_for_each_entry(vma, &obj->vma_list, obj_link) {
+	list_for_each_entry(vma, &obj->vma.list, obj_link) {
 		if (!drm_mm_node_allocated(&vma->node))
 			continue;
 
@@ -322,7 +322,7 @@ static int per_file_stats(int id, void *ptr, void *data)
 	if (obj->base.name || obj->base.dma_buf)
 		stats->shared += obj->base.size;
 
-	list_for_each_entry(vma, &obj->vma_list, obj_link) {
+	list_for_each_entry(vma, &obj->vma.list, obj_link) {
 		if (!drm_mm_node_allocated(&vma->node))
 			continue;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 538fa5404603..15acd052da46 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -437,15 +437,19 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
 	if (ret)
 		return ret;
 
-	while ((vma = list_first_entry_or_null(&obj->vma_list,
-					       struct i915_vma,
-					       obj_link))) {
+	spin_lock(&obj->vma.lock);
+	while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
+						       struct i915_vma,
+						       obj_link))) {
 		list_move_tail(&vma->obj_link, &still_in_list);
+		spin_unlock(&obj->vma.lock);
+
 		ret = i915_vma_unbind(vma);
-		if (ret)
-			break;
+
+		spin_lock(&obj->vma.lock);
 	}
-	list_splice(&still_in_list, &obj->vma_list);
+	list_splice(&still_in_list, &obj->vma.list);
+	spin_unlock(&obj->vma.lock);
 
 	return ret;
 }
@@ -3489,7 +3493,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 	 * reading an invalid PTE on older architectures.
 	 */
 restart:
-	list_for_each_entry(vma, &obj->vma_list, obj_link) {
+	list_for_each_entry(vma, &obj->vma.list, obj_link) {
 		if (!drm_mm_node_allocated(&vma->node))
 			continue;
 
@@ -3567,7 +3571,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 			 */
 		}
 
-		list_for_each_entry(vma, &obj->vma_list, obj_link) {
+		list_for_each_entry(vma, &obj->vma.list, obj_link) {
 			if (!drm_mm_node_allocated(&vma->node))
 				continue;
 
@@ -3577,7 +3581,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 	}
 
-	list_for_each_entry(vma, &obj->vma_list, obj_link)
+	list_for_each_entry(vma, &obj->vma.list, obj_link)
 		vma->node.color = cache_level;
 	i915_gem_object_set_cache_coherency(obj, cache_level);
 	obj->cache_dirty = true; /* Always invalidate stale cachelines */
@@ -4153,7 +4157,9 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
 {
 	mutex_init(&obj->mm.lock);
 
-	INIT_LIST_HEAD(&obj->vma_list);
+	spin_lock_init(&obj->vma.lock);
+	INIT_LIST_HEAD(&obj->vma.list);
+
 	INIT_LIST_HEAD(&obj->lut_list);
 	INIT_LIST_HEAD(&obj->batch_pool_link);
 
@@ -4319,14 +4325,13 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		mutex_lock(&i915->drm.struct_mutex);
 
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
-		list_for_each_entry_safe(vma, vn,
-					 &obj->vma_list, obj_link) {
+		list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_destroy(vma);
 		}
-		GEM_BUG_ON(!list_empty(&obj->vma_list));
-		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
+		GEM_BUG_ON(!list_empty(&obj->vma.list));
+		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
 
 		/* This serializes freeing with the shrinker. Since the free
 		 * is delayed, first by RCU then by the workqueue, we want the
diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
index cb1b0144d274..5a33b6d9f942 100644
--- a/drivers/gpu/drm/i915/i915_gem_object.h
+++ b/drivers/gpu/drm/i915/i915_gem_object.h
@@ -87,24 +87,33 @@ struct drm_i915_gem_object {
 
 	const struct drm_i915_gem_object_ops *ops;
 
-	/**
-	 * @vma_list: List of VMAs backed by this object
-	 *
-	 * The VMA on this list are ordered by type, all GGTT vma are placed
-	 * at the head and all ppGTT vma are placed at the tail. The different
-	 * types of GGTT vma are unordered between themselves, use the
-	 * @vma_tree (which has a defined order between all VMA) to find an
-	 * exact match.
-	 */
-	struct list_head vma_list;
-	/**
-	 * @vma_tree: Ordered tree of VMAs backed by this object
-	 *
-	 * All VMA created for this object are placed in the @vma_tree for
-	 * fast retrieval via a binary search in i915_vma_instance().
-	 * They are also added to @vma_list for easy iteration.
-	 */
-	struct rb_root vma_tree;
+	struct {
+		/**
+		 * @vma.lock: protect the list/tree of vmas
+		 */
+		struct spinlock lock;
+
+		/**
+		 * @vma.list: List of VMAs backed by this object
+		 *
+		 * The VMA on this list are ordered by type, all GGTT vma are
+		 * placed at the head and all ppGTT vma are placed at the tail.
+		 * The different types of GGTT vma are unordered between
+		 * themselves, use the @vma.tree (which has a defined order
+		 * between all VMA) to quickly find an exact match.
+		 */
+		struct list_head list;
+
+		/**
+		 * @vma.tree: Ordered tree of VMAs backed by this object
+		 *
+		 * All VMA created for this object are placed in the @vma.tree
+		 * for fast retrieval via a binary search in
+		 * i915_vma_instance(). They are also added to @vma.list for
+		 * easy iteration.
+		 */
+		struct rb_root tree;
+	} vma;
 
 	/**
 	 * @lut_list: List of vma lookup entries in use for this object.
diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
index dcbd0d345c72..d83b8ad5f859 100644
--- a/drivers/gpu/drm/i915/i915_vma.c
+++ b/drivers/gpu/drm/i915/i915_vma.c
@@ -187,32 +187,52 @@ vma_create(struct drm_i915_gem_object *obj,
 								i915_gem_object_get_stride(obj));
 		GEM_BUG_ON(!is_power_of_2(vma->fence_alignment));
 
-		/*
-		 * We put the GGTT vma at the start of the vma-list, followed
-		 * by the ppGGTT vma. This allows us to break early when
-		 * iterating over only the GGTT vma for an object, see
-		 * for_each_ggtt_vma()
-		 */
 		vma->flags |= I915_VMA_GGTT;
-		list_add(&vma->obj_link, &obj->vma_list);
-	} else {
-		list_add_tail(&vma->obj_link, &obj->vma_list);
 	}
 
+	spin_lock(&obj->vma.lock);
+
 	rb = NULL;
-	p = &obj->vma_tree.rb_node;
+	p = &obj->vma.tree.rb_node;
 	while (*p) {
 		struct i915_vma *pos;
+		long cmp;
 
 		rb = *p;
 		pos = rb_entry(rb, struct i915_vma, obj_node);
-		if (i915_vma_compare(pos, vm, view) < 0)
+
+		/*
+		 * If the view already exists in the tree, another thread
+		 * already created a matching vma, so return the older instance
+		 * and dispose of ours.
+		 */
+		cmp = i915_vma_compare(pos, vm, view);
+		if (cmp == 0) {
+			spin_unlock(&obj->vma.lock);
+			kmem_cache_free(vm->i915->vmas, vma);
+			return pos;
+		}
+
+		if (cmp < 0)
 			p = &rb->rb_right;
 		else
 			p = &rb->rb_left;
 	}
 	rb_link_node(&vma->obj_node, rb, p);
-	rb_insert_color(&vma->obj_node, &obj->vma_tree);
+	rb_insert_color(&vma->obj_node, &obj->vma.tree);
+
+	if (i915_vma_is_ggtt(vma))
+		/*
+		 * We put the GGTT vma at the start of the vma-list, followed
+		 * by the ppGGTT vma. This allows us to break early when
+		 * iterating over only the GGTT vma for an object, see
+		 * for_each_ggtt_vma()
+		 */
+		list_add(&vma->obj_link, &obj->vma.list);
+	else
+		list_add_tail(&vma->obj_link, &obj->vma.list);
+
+	spin_unlock(&obj->vma.lock);
 
 	mutex_lock(&vm->mutex);
 	list_add(&vma->vm_link, &vm->unbound_list);
@@ -232,7 +252,7 @@ vma_lookup(struct drm_i915_gem_object *obj,
 {
 	struct rb_node *rb;
 
-	rb = obj->vma_tree.rb_node;
+	rb = obj->vma.tree.rb_node;
 	while (rb) {
 		struct i915_vma *vma = rb_entry(rb, struct i915_vma, obj_node);
 		long cmp;
@@ -272,16 +292,18 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
 {
 	struct i915_vma *vma;
 
-	lockdep_assert_held(&obj->base.dev->struct_mutex);
 	GEM_BUG_ON(view && !i915_is_ggtt(vm));
 	GEM_BUG_ON(vm->closed);
 
+	spin_lock(&obj->vma.lock);
 	vma = vma_lookup(obj, vm, view);
-	if (!vma)
+	spin_unlock(&obj->vma.lock);
+
+	/* vma_create() will resolve the race if another creates the vma */
+	if (unlikely(!vma))
 		vma = vma_create(obj, vm, view);
 
 	GEM_BUG_ON(!IS_ERR(vma) && i915_vma_compare(vma, vm, view));
-	GEM_BUG_ON(!IS_ERR(vma) && vma_lookup(obj, vm, view) != vma);
 	return vma;
 }
 
@@ -808,14 +830,18 @@ static void __i915_vma_destroy(struct i915_vma *vma)
 
 	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
 
-	list_del(&vma->obj_link);
-
 	mutex_lock(&vma->vm->mutex);
 	list_del(&vma->vm_link);
 	mutex_unlock(&vma->vm->mutex);
 
-	if (vma->obj)
-		rb_erase(&vma->obj_node, &vma->obj->vma_tree);
+	if (vma->obj) {
+		struct drm_i915_gem_object *obj = vma->obj;
+
+		spin_lock(&obj->vma.lock);
+		list_del(&vma->obj_link);
+		rb_erase(&vma->obj_node, &vma->obj->vma.tree);
+		spin_unlock(&obj->vma.lock);
+	}
 
 	rbtree_postorder_for_each_entry_safe(iter, n, &vma->active, node) {
 		GEM_BUG_ON(i915_gem_active_isset(&iter->base));
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 4f7c1c7599f4..7252abc73d3e 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -425,7 +425,7 @@ void i915_vma_parked(struct drm_i915_private *i915);
  * or the list is empty ofc.
  */
 #define for_each_ggtt_vma(V, OBJ) \
-	list_for_each_entry(V, &(OBJ)->vma_list, obj_link)		\
+	list_for_each_entry(V, &(OBJ)->vma.list, obj_link)		\
 		for_each_until(!i915_vma_is_ggtt(V))
 
 #endif
diff --git a/drivers/gpu/drm/i915/selftests/i915_vma.c b/drivers/gpu/drm/i915/selftests/i915_vma.c
index ffa74290e054..f1008b07dfd2 100644
--- a/drivers/gpu/drm/i915/selftests/i915_vma.c
+++ b/drivers/gpu/drm/i915/selftests/i915_vma.c
@@ -670,7 +670,7 @@ static int igt_vma_partial(void *arg)
 		}
 
 		count = 0;
-		list_for_each_entry(vma, &obj->vma_list, obj_link)
+		list_for_each_entry(vma, &obj->vma.list, obj_link)
 			count++;
 		if (count != nvma) {
 			pr_err("(%s) All partial vma were not recorded on the obj->vma_list: found %u, expected %u\n",
@@ -699,7 +699,7 @@ static int igt_vma_partial(void *arg)
 		i915_vma_unpin(vma);
 
 		count = 0;
-		list_for_each_entry(vma, &obj->vma_list, obj_link)
+		list_for_each_entry(vma, &obj->vma.list, obj_link)
 			count++;
 		if (count != nvma) {
 			pr_err("(%s) allocated an extra full vma!\n", p->name);
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (8 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 09/38] drm/i915: Move vma lookup to its own lock Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 16:26   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 11/38] drm/i915: Always allocate an object/vma for the HWSP Chris Wilson
                   ` (29 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

To correctly simulate preemption between contexts, we need independent
timelines for each context. Make it so.
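
For illustration only, the allocate-on-first-pin lifetime this moves to can
be sketched in plain C (hypothetical names, nothing here is the mock engine
API itself): the per-context resource is created the first time the context
is pinned and freed when the context is destroyed, rather than borrowed from
the engine.

#include <stdio.h>
#include <stdlib.h>

struct mock_timeline { int dummy; };

struct mock_context {
        int pin_count;
        struct mock_timeline *timeline; /* allocated on first pin */
};

static struct mock_context *context_pin(struct mock_context *ce)
{
        if (ce->pin_count++)
                return ce;              /* already pinned, reuse resources */

        if (!ce->timeline) {
                ce->timeline = calloc(1, sizeof(*ce->timeline));
                if (!ce->timeline) {
                        ce->pin_count = 0;
                        return NULL;
                }
        }
        return ce;
}

static void context_unpin(struct mock_context *ce)
{
        if (ce->pin_count > 0)
                ce->pin_count--;
}

static void context_destroy(struct mock_context *ce)
{
        free(ce->timeline);             /* freed with the context, not the engine */
        ce->timeline = NULL;
}

int main(void)
{
        struct mock_context ce = { 0, NULL };

        context_pin(&ce);
        context_pin(&ce);
        context_unpin(&ce);
        context_unpin(&ce);
        context_destroy(&ce);
        printf("ok\n");
        return 0;
}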

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/selftests/mock_engine.c | 90 ++++++++++----------
 1 file changed, 47 insertions(+), 43 deletions(-)

diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 9fe5b2c8f8d4..8b8d51af7d6a 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -30,6 +30,36 @@ struct mock_ring {
 	struct i915_timeline timeline;
 };
 
+static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
+{
+	const unsigned long sz = PAGE_SIZE / 2;
+	struct mock_ring *ring;
+
+	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
+	if (!ring)
+		return NULL;
+
+	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
+
+	ring->base.size = sz;
+	ring->base.effective_size = sz;
+	ring->base.vaddr = (void *)(ring + 1);
+	ring->base.timeline = &ring->timeline;
+
+	INIT_LIST_HEAD(&ring->base.request_list);
+	intel_ring_update_space(&ring->base);
+
+	return &ring->base;
+}
+
+static void mock_ring_free(struct intel_ring *base)
+{
+	struct mock_ring *ring = container_of(base, typeof(*ring), base);
+
+	i915_timeline_fini(&ring->timeline);
+	kfree(ring);
+}
+
 static struct mock_request *first_request(struct mock_engine *engine)
 {
 	return list_first_entry_or_null(&engine->hw_queue,
@@ -80,6 +110,9 @@ static void mock_context_unpin(struct intel_context *ce)
 static void mock_context_destroy(struct intel_context *ce)
 {
 	GEM_BUG_ON(ce->pin_count);
+
+	if (ce->ring)
+		mock_ring_free(ce->ring);
 }
 
 static const struct intel_context_ops mock_context_ops = {
@@ -93,13 +126,22 @@ mock_context_pin(struct intel_engine_cs *engine,
 {
 	struct intel_context *ce = to_intel_context(ctx, engine);
 
-	if (!ce->pin_count++) {
-		i915_gem_context_get(ctx);
-		ce->ring = engine->buffer;
-		ce->ops = &mock_context_ops;
+	if (ce->pin_count++)
+		return ce;
+
+	if (!ce->ring) {
+		ce->ring = mock_ring(engine);
+		if (!ce->ring)
+			goto err;
 	}
 
+	ce->ops = &mock_context_ops;
+	i915_gem_context_get(ctx);
 	return ce;
+
+err:
+	ce->pin_count = 0;
+	return ERR_PTR(-ENOMEM);
 }
 
 static int mock_request_alloc(struct i915_request *request)
@@ -143,36 +185,6 @@ static void mock_submit_request(struct i915_request *request)
 	spin_unlock_irq(&engine->hw_lock);
 }
 
-static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
-{
-	const unsigned long sz = PAGE_SIZE / 2;
-	struct mock_ring *ring;
-
-	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
-	if (!ring)
-		return NULL;
-
-	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
-
-	ring->base.size = sz;
-	ring->base.effective_size = sz;
-	ring->base.vaddr = (void *)(ring + 1);
-	ring->base.timeline = &ring->timeline;
-
-	INIT_LIST_HEAD(&ring->base.request_list);
-	intel_ring_update_space(&ring->base);
-
-	return &ring->base;
-}
-
-static void mock_ring_free(struct intel_ring *base)
-{
-	struct mock_ring *ring = container_of(base, typeof(*ring), base);
-
-	i915_timeline_fini(&ring->timeline);
-	kfree(ring);
-}
-
 struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 				    const char *name,
 				    int id)
@@ -207,17 +219,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	timer_setup(&engine->hw_delay, hw_delay_complete, 0);
 	INIT_LIST_HEAD(&engine->hw_queue);
 
-	engine->base.buffer = mock_ring(&engine->base);
-	if (!engine->base.buffer)
-		goto err_breadcrumbs;
-
 	if (IS_ERR(intel_context_pin(i915->kernel_context, &engine->base)))
-		goto err_ring;
+		goto err_breadcrumbs;
 
 	return &engine->base;
 
-err_ring:
-	mock_ring_free(engine->base.buffer);
 err_breadcrumbs:
 	intel_engine_fini_breadcrumbs(&engine->base);
 	i915_timeline_fini(&engine->base.timeline);
@@ -260,8 +266,6 @@ void mock_engine_free(struct intel_engine_cs *engine)
 
 	__intel_context_unpin(engine->i915->kernel_context, engine);
 
-	mock_ring_free(engine->buffer);
-
 	intel_engine_fini_breadcrumbs(engine);
 	i915_timeline_fini(&engine->timeline);
 
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 11/38] drm/i915: Always allocate an object/vma for the HWSP
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (9 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 12/38] drm/i915: Move list of timelines under its own lock Chris Wilson
                   ` (28 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx; +Cc: Matthew Auld

Currently we only allocate an object and vma if we are using a GGTT
virtual HWSP, and a plain struct page for a physical HWSP. For
convenience later on with global timelines, it will be useful to always
have the status page tracked by a struct i915_vma. Make it so.
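
A small, purely illustrative sketch of the shape this gives us (the names
below are invented for the example and are not the i915 API): both the
physical and the GGTT flavours end up behind the same tracked handle, with
only the GGTT case additionally pinned.

#include <stdio.h>
#include <stdlib.h>

struct hwsp_vma { void *backing; int pinned_in_ggtt; };

struct status_page {
        struct hwsp_vma *vma;           /* always present after init */
        void *addr;                     /* CPU address of the page */
};

static int init_status_page(struct status_page *sp, int needs_physical)
{
        struct hwsp_vma *vma = calloc(1, sizeof(*vma));

        if (!vma)
                return -1;

        vma->backing = calloc(1, 4096);
        if (!vma->backing) {
                free(vma);
                return -1;
        }

        /* Both flavours are tracked the same way ... */
        sp->vma = vma;
        sp->addr = vma->backing;

        /* ... and only the non-physical case is additionally "pinned". */
        if (!needs_physical)
                vma->pinned_in_ggtt = 1;

        return 0;
}

int main(void)
{
        struct status_page sp;

        if (init_status_page(&sp, 0))
                return 1;
        printf("hwsp at %p, ggtt pinned=%d\n", sp.addr, sp.vma->pinned_in_ggtt);
        free(sp.vma->backing);
        free(sp.vma);
        return 0;
}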

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Matthew Auld <matthew.auld@intel.com>
---
 drivers/gpu/drm/i915/intel_engine_cs.c       | 109 ++++++++++---------
 drivers/gpu/drm/i915/intel_guc_submission.c  |   5 +
 drivers/gpu/drm/i915/intel_lrc.c             |  11 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c      |  20 +++-
 drivers/gpu/drm/i915/intel_ringbuffer.h      |  23 +---
 drivers/gpu/drm/i915/selftests/mock_engine.c |   2 +-
 6 files changed, 90 insertions(+), 80 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index fc52737751e7..4b4b7358c482 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -506,27 +506,61 @@ void intel_engine_setup_common(struct intel_engine_cs *engine)
 
 static void cleanup_status_page(struct intel_engine_cs *engine)
 {
+	struct i915_vma *vma;
+
 	/* Prevent writes into HWSP after returning the page to the system */
 	intel_engine_set_hwsp_writemask(engine, ~0u);
 
-	if (HWS_NEEDS_PHYSICAL(engine->i915)) {
-		void *addr = fetch_and_zero(&engine->status_page.page_addr);
+	vma = fetch_and_zero(&engine->status_page.vma);
+	if (!vma)
+		return;
 
-		__free_page(virt_to_page(addr));
-	}
+	if (!HWS_NEEDS_PHYSICAL(engine->i915))
+		i915_vma_unpin(vma);
+
+	i915_gem_object_unpin_map(vma->obj);
+	__i915_gem_object_release_unless_active(vma->obj);
+}
+
+static int pin_ggtt_status_page(struct intel_engine_cs *engine,
+				struct i915_vma *vma)
+{
+	unsigned int flags;
+
+	flags = PIN_GLOBAL;
+	if (!HAS_LLC(engine->i915))
+		/*
+		 * On g33, we cannot place HWS above 256MiB, so
+		 * restrict its pinning to the low mappable arena.
+		 * Though this restriction is not documented for
+		 * gen4, gen5, or byt, they also behave similarly
+		 * and hang if the HWS is placed at the top of the
+		 * GTT. To generalise, it appears that all !llc
+		 * platforms have issues with us placing the HWS
+		 * above the mappable region (even though we never
+		 * actually map it).
+		 */
+		flags |= PIN_MAPPABLE;
+	else
+		flags |= PIN_HIGH;
 
-	i915_vma_unpin_and_release(&engine->status_page.vma,
-				   I915_VMA_RELEASE_MAP);
+	return i915_vma_pin(vma, 0, 0, flags);
 }
 
 static int init_status_page(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_object *obj;
 	struct i915_vma *vma;
-	unsigned int flags;
 	void *vaddr;
 	int ret;
 
+	/*
+	 * Though the HWS register does support 36bit addresses, historically
+	 * we have had hangs and corruption reported due to wild writes if
+	 * the HWS is placed above 4G. We only allow objects to be allocated
+	 * in GFP_DMA32 for i965, and no earlier physical address users had
+	 * access to more than 4G.
+	 */
 	obj = i915_gem_object_create_internal(engine->i915, PAGE_SIZE);
 	if (IS_ERR(obj)) {
 		DRM_ERROR("Failed to allocate status page\n");
@@ -543,61 +577,30 @@ static int init_status_page(struct intel_engine_cs *engine)
 		goto err;
 	}
 
-	flags = PIN_GLOBAL;
-	if (!HAS_LLC(engine->i915))
-		/* On g33, we cannot place HWS above 256MiB, so
-		 * restrict its pinning to the low mappable arena.
-		 * Though this restriction is not documented for
-		 * gen4, gen5, or byt, they also behave similarly
-		 * and hang if the HWS is placed at the top of the
-		 * GTT. To generalise, it appears that all !llc
-		 * platforms have issues with us placing the HWS
-		 * above the mappable region (even though we never
-		 * actually map it).
-		 */
-		flags |= PIN_MAPPABLE;
-	else
-		flags |= PIN_HIGH;
-	ret = i915_vma_pin(vma, 0, 0, flags);
-	if (ret)
-		goto err;
-
 	vaddr = i915_gem_object_pin_map(obj, I915_MAP_WB);
 	if (IS_ERR(vaddr)) {
 		ret = PTR_ERR(vaddr);
-		goto err_unpin;
+		goto err;
 	}
 
+	engine->status_page.addr = memset(vaddr, 0, PAGE_SIZE);
 	engine->status_page.vma = vma;
-	engine->status_page.ggtt_offset = i915_ggtt_offset(vma);
-	engine->status_page.page_addr = memset(vaddr, 0, PAGE_SIZE);
+
+	if (!HWS_NEEDS_PHYSICAL(engine->i915)) {
+		ret = pin_ggtt_status_page(engine, vma);
+		if (ret)
+			goto err_unpin;
+	}
+
 	return 0;
 
 err_unpin:
-	i915_vma_unpin(vma);
+	i915_gem_object_unpin_map(obj);
 err:
 	i915_gem_object_put(obj);
 	return ret;
 }
 
-static int init_phys_status_page(struct intel_engine_cs *engine)
-{
-	struct page *page;
-
-	/*
-	 * Though the HWS register does support 36bit addresses, historically
-	 * we have had hangs and corruption reported due to wild writes if
-	 * the HWS is placed above 4G.
-	 */
-	page = alloc_page(GFP_KERNEL | __GFP_DMA32 | __GFP_ZERO);
-	if (!page)
-		return -ENOMEM;
-
-	engine->status_page.page_addr = page_address(page);
-
-	return 0;
-}
-
 static void __intel_context_unpin(struct i915_gem_context *ctx,
 				  struct intel_engine_cs *engine)
 {
@@ -650,10 +653,7 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 	if (ret)
 		goto err_unpin_preempt;
 
-	if (HWS_NEEDS_PHYSICAL(i915))
-		ret = init_phys_status_page(engine);
-	else
-		ret = init_status_page(engine);
+	ret = init_status_page(engine);
 	if (ret)
 		goto err_breadcrumbs;
 
@@ -1318,7 +1318,8 @@ static void intel_engine_print_registers(const struct intel_engine_cs *engine,
 	}
 
 	if (HAS_EXECLISTS(dev_priv)) {
-		const u32 *hws = &engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
+		const u32 *hws =
+			&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
 		unsigned int idx;
 		u8 read, write;
 
@@ -1501,7 +1502,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	spin_unlock_irqrestore(&b->rb_lock, flags);
 
 	drm_printf(m, "HWSP:\n");
-	hexdump(m, engine->status_page.page_addr, PAGE_SIZE);
+	hexdump(m, engine->status_page.addr, PAGE_SIZE);
 
 	drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine)));
 }
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index 7217c7e3ee8d..b044162a41d3 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -81,6 +81,11 @@
  *
  */
 
+static inline u32 intel_hws_preempt_done_address(struct intel_engine_cs *engine)
+{
+	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_PREEMPT_ADDR;
+}
+
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
 	return rb_entry(rb, struct i915_priolist, node);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ebb693c93046..b2d4abe7b601 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -172,6 +172,11 @@ static void execlists_init_reg_state(u32 *reg_state,
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
 
+static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
+{
+	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_INDEX_ADDR;
+}
+
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
 	return rb_entry(rb, struct i915_priolist, node);
@@ -1691,7 +1696,7 @@ static void enable_execlists(struct intel_engine_cs *engine)
 		   _MASKED_BIT_DISABLE(STOP_RING));
 
 	I915_WRITE(RING_HWS_PGA(engine->mmio_base),
-		   engine->status_page.ggtt_offset);
+		   i915_ggtt_offset(engine->status_page.vma));
 	POSTING_READ(RING_HWS_PGA(engine->mmio_base));
 }
 
@@ -2238,10 +2243,10 @@ static int logical_ring_init(struct intel_engine_cs *engine)
 	}
 
 	execlists->csb_status =
-		&engine->status_page.page_addr[I915_HWS_CSB_BUF0_INDEX];
+		&engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
 
 	execlists->csb_write =
-		&engine->status_page.page_addr[intel_hws_csb_write_index(i915)];
+		&engine->status_page.addr[intel_hws_csb_write_index(i915)];
 
 	reset_csb_pointers(execlists);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 662907e1a286..d72012b42f20 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -43,6 +43,11 @@
  */
 #define LEGACY_REQUEST_SIZE 200
 
+static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
+{
+	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_INDEX_ADDR;
+}
+
 static unsigned int __intel_ring_space(unsigned int head,
 				       unsigned int tail,
 				       unsigned int size)
@@ -499,12 +504,17 @@ static void set_hws_pga(struct intel_engine_cs *engine, phys_addr_t phys)
 	I915_WRITE(HWS_PGA, addr);
 }
 
-static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
+static struct page *status_page(struct intel_engine_cs *engine)
 {
-	struct page *page = virt_to_page(engine->status_page.page_addr);
-	phys_addr_t phys = PFN_PHYS(page_to_pfn(page));
+	struct drm_i915_gem_object *obj = engine->status_page.vma->obj;
 
-	set_hws_pga(engine, phys);
+	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
+	return sg_page(obj->mm.pages->sgl);
+}
+
+static void ring_setup_phys_status_page(struct intel_engine_cs *engine)
+{
+	set_hws_pga(engine, PFN_PHYS(page_to_pfn(status_page(engine))));
 	set_hwstam(engine, ~0u);
 }
 
@@ -571,7 +581,7 @@ static void flush_cs_tlb(struct intel_engine_cs *engine)
 
 static void ring_setup_status_page(struct intel_engine_cs *engine)
 {
-	set_hwsp(engine, engine->status_page.ggtt_offset);
+	set_hwsp(engine, i915_ggtt_offset(engine->status_page.vma));
 	set_hwstam(engine, ~0u);
 
 	flush_cs_tlb(engine);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9ecae7de3b0a..d1a82610e0c1 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -32,8 +32,7 @@ struct i915_sched_attr;
 
 struct intel_hw_status_page {
 	struct i915_vma *vma;
-	u32 *page_addr;
-	u32 ggtt_offset;
+	u32 *addr;
 };
 
 #define I915_READ_TAIL(engine) I915_READ(RING_TAIL((engine)->mmio_base))
@@ -679,7 +678,7 @@ static inline u32
 intel_read_status_page(const struct intel_engine_cs *engine, int reg)
 {
 	/* Ensure that the compiler doesn't optimize away the load. */
-	return READ_ONCE(engine->status_page.page_addr[reg]);
+	return READ_ONCE(engine->status_page.addr[reg]);
 }
 
 static inline void
@@ -692,12 +691,12 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 	 */
 	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
 		mb();
-		clflush(&engine->status_page.page_addr[reg]);
-		engine->status_page.page_addr[reg] = value;
-		clflush(&engine->status_page.page_addr[reg]);
+		clflush(&engine->status_page.addr[reg]);
+		engine->status_page.addr[reg] = value;
+		clflush(&engine->status_page.addr[reg]);
 		mb();
 	} else {
-		WRITE_ONCE(engine->status_page.page_addr[reg], value);
+		WRITE_ONCE(engine->status_page.addr[reg], value);
 	}
 }
 
@@ -885,16 +884,6 @@ static inline bool intel_engine_has_started(struct intel_engine_cs *engine,
 void intel_engine_get_instdone(struct intel_engine_cs *engine,
 			       struct intel_instdone *instdone);
 
-static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
-{
-	return engine->status_page.ggtt_offset + I915_GEM_HWS_INDEX_ADDR;
-}
-
-static inline u32 intel_hws_preempt_done_address(struct intel_engine_cs *engine)
-{
-	return engine->status_page.ggtt_offset + I915_GEM_HWS_PREEMPT_ADDR;
-}
-
 /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
 
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 8b8d51af7d6a..968a7e139a67 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -201,7 +201,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	engine->base.i915 = i915;
 	snprintf(engine->base.name, sizeof(engine->base.name), "%s", name);
 	engine->base.id = id;
-	engine->base.status_page.page_addr = (void *)(engine + 1);
+	engine->base.status_page.addr = (void *)(engine + 1);
 
 	engine->base.context_pin = mock_context_pin;
 	engine->base.request_alloc = mock_request_alloc;
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 12/38] drm/i915: Move list of timelines under its own lock
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (10 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 11/38] drm/i915: Always allocate an object/vma for the HWSP Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 16:28   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 13/38] drm/i915: Introduce concept of per-timeline (context) HWSP Chris Wilson
                   ` (27 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Currently, the list of timelines is serialised by the struct_mutex, but
to alleviate difficulties with using that mutex in future, move the
list management under its own dedicated mutex.
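
The subtle part is waiting on an entry of a mutex-protected list: the mutex
is dropped around the potentially long wait and reacquired afterwards, at
which point the iteration restarts from the head because the cursor can no
longer be trusted. A standalone sketch of that discipline (a pthread mutex
stands in for gt->mutex; the names are illustrative):

#include <pthread.h>
#include <stdio.h>

struct timeline {
        int busy;                       /* pretend "has an outstanding request" */
        struct timeline *next;
};

static pthread_mutex_t tl_lock = PTHREAD_MUTEX_INITIALIZER;
static struct timeline *tl_list;

static void wait_on(struct timeline *tl)
{
        /* Stand-in for i915_request_wait(); may block for a long time. */
        tl->busy = 0;
}

static void wait_for_idle(void)
{
        struct timeline *tl;

        pthread_mutex_lock(&tl_lock);
restart:
        for (tl = tl_list; tl; tl = tl->next) {
                if (!tl->busy)
                        continue;

                /* Drop the lock around the wait ... */
                pthread_mutex_unlock(&tl_lock);
                wait_on(tl);
                pthread_mutex_lock(&tl_lock);

                /* ... then restart: the list may have changed meanwhile. */
                goto restart;
        }
        pthread_mutex_unlock(&tl_lock);
}

int main(void)
{
        struct timeline a = { 1, NULL };
        struct timeline b = { 1, &a };

        tl_list = &b;
        wait_for_idle();
        printf("all timelines idle\n");
        return 0;
}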

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h               |  5 +-
 drivers/gpu/drm/i915/i915_gem.c               | 88 +++++++++----------
 drivers/gpu/drm/i915/i915_reset.c             |  8 +-
 drivers/gpu/drm/i915/i915_timeline.c          | 38 ++++++--
 drivers/gpu/drm/i915/i915_timeline.h          |  3 +
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  7 +-
 .../gpu/drm/i915/selftests/mock_timeline.c    |  3 +-
 7 files changed, 94 insertions(+), 58 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 94680b15bed0..3913900600b7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1975,7 +1975,10 @@ struct drm_i915_private {
 		void (*resume)(struct drm_i915_private *);
 		void (*cleanup_engine)(struct intel_engine_cs *engine);
 
-		struct list_head timelines;
+		struct i915_gt_timelines {
+			struct mutex mutex; /* protects list, tainted by GPU */
+			struct list_head list;
+		} timelines;
 
 		struct list_head active_rings;
 		struct list_head closed_vma;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 15acd052da46..0b44059dfab2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3222,33 +3222,6 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	return ret;
 }
 
-static long wait_for_timeline(struct i915_timeline *tl,
-			      unsigned int flags, long timeout)
-{
-	struct i915_request *rq;
-
-	rq = i915_gem_active_get_unlocked(&tl->last_request);
-	if (!rq)
-		return timeout;
-
-	/*
-	 * "Race-to-idle".
-	 *
-	 * Switching to the kernel context is often used a synchronous
-	 * step prior to idling, e.g. in suspend for flushing all
-	 * current operations to memory before sleeping. These we
-	 * want to complete as quickly as possible to avoid prolonged
-	 * stalls, so allow the gpu to boost to maximum clocks.
-	 */
-	if (flags & I915_WAIT_FOR_IDLE_BOOST)
-		gen6_rps_boost(rq, NULL);
-
-	timeout = i915_request_wait(rq, flags, timeout);
-	i915_request_put(rq);
-
-	return timeout;
-}
-
 static int wait_for_engines(struct drm_i915_private *i915)
 {
 	if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
@@ -3265,6 +3238,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
 int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 			   unsigned int flags, long timeout)
 {
+	struct i915_timeline *tl;
+
 	GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
 		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
 		  timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
@@ -3273,17 +3248,44 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 	if (!READ_ONCE(i915->gt.awake))
 		return 0;
 
+	mutex_lock(&i915->gt.timelines.mutex);
+	list_for_each_entry(tl, &i915->gt.timelines.list, link) {
+		struct i915_request *rq;
+
+		rq = i915_gem_active_get_unlocked(&tl->last_request);
+		if (!rq)
+			continue;
+
+		mutex_unlock(&i915->gt.timelines.mutex);
+
+		/*
+		 * "Race-to-idle".
+		 *
+		 * Switching to the kernel context is often used a synchronous
+		 * step prior to idling, e.g. in suspend for flushing all
+		 * current operations to memory before sleeping. These we
+		 * want to complete as quickly as possible to avoid prolonged
+		 * stalls, so allow the gpu to boost to maximum clocks.
+		 */
+		if (flags & I915_WAIT_FOR_IDLE_BOOST)
+			gen6_rps_boost(rq, NULL);
+
+		timeout = i915_request_wait(rq, flags, timeout);
+		i915_request_put(rq);
+		if (timeout < 0)
+			return timeout;
+
+		/* restart after reacquiring the lock */
+		mutex_lock(&i915->gt.timelines.mutex);
+		tl = list_entry(&i915->gt.timelines.list, typeof(*tl), link);
+	}
+	mutex_unlock(&i915->gt.timelines.mutex);
+
 	if (flags & I915_WAIT_LOCKED) {
-		struct i915_timeline *tl;
 		int err;
 
 		lockdep_assert_held(&i915->drm.struct_mutex);
 
-		list_for_each_entry(tl, &i915->gt.timelines, link) {
-			timeout = wait_for_timeline(tl, flags, timeout);
-			if (timeout < 0)
-				return timeout;
-		}
 		if (GEM_SHOW_DEBUG() && !timeout) {
 			/* Presume that timeout was non-zero to begin with! */
 			dev_warn(&i915->drm.pdev->dev,
@@ -3297,17 +3299,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 
 		i915_retire_requests(i915);
 		GEM_BUG_ON(i915->gt.active_requests);
-	} else {
-		struct intel_engine_cs *engine;
-		enum intel_engine_id id;
-
-		for_each_engine(engine, i915, id) {
-			struct i915_timeline *tl = &engine->timeline;
-
-			timeout = wait_for_timeline(tl, flags, timeout);
-			if (timeout < 0)
-				return timeout;
-		}
 	}
 
 	return 0;
@@ -5008,6 +4999,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
 	}
 
+	i915_timelines_init(dev_priv);
+
 	ret = i915_gem_init_userptr(dev_priv);
 	if (ret)
 		return ret;
@@ -5130,8 +5123,10 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
 err_uc_misc:
 	intel_uc_fini_misc(dev_priv);
 
-	if (ret != -EIO)
+	if (ret != -EIO) {
 		i915_gem_cleanup_userptr(dev_priv);
+		i915_timelines_fini(dev_priv);
+	}
 
 	if (ret == -EIO) {
 		mutex_lock(&dev_priv->drm.struct_mutex);
@@ -5182,6 +5177,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
 
 	intel_uc_fini_misc(dev_priv);
 	i915_gem_cleanup_userptr(dev_priv);
+	i915_timelines_fini(dev_priv);
 
 	i915_gem_drain_freed_objects(dev_priv);
 
@@ -5284,7 +5280,6 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	if (!dev_priv->priorities)
 		goto err_dependencies;
 
-	INIT_LIST_HEAD(&dev_priv->gt.timelines);
 	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
 	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
 
@@ -5328,7 +5323,6 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
 	GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
 	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
 	WARN_ON(dev_priv->mm.object_count);
-	WARN_ON(!list_empty(&dev_priv->gt.timelines));
 
 	kmem_cache_destroy(dev_priv->priorities);
 	kmem_cache_destroy(dev_priv->dependencies);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index d44b095e2860..12e5a2bc825c 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -850,7 +850,8 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 	 *
 	 * No more can be submitted until we reset the wedged bit.
 	 */
-	list_for_each_entry(tl, &i915->gt.timelines, link) {
+	mutex_lock(&i915->gt.timelines.mutex);
+	list_for_each_entry(tl, &i915->gt.timelines.list, link) {
 		struct i915_request *rq;
 		long timeout;
 
@@ -872,9 +873,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
 		timeout = dma_fence_default_wait(&rq->fence, true,
 						 MAX_SCHEDULE_TIMEOUT);
 		i915_request_put(rq);
-		if (timeout < 0)
+		if (timeout < 0) {
+			mutex_unlock(&i915->gt.timelines.mutex);
 			goto unlock;
+		}
 	}
+	mutex_unlock(&i915->gt.timelines.mutex);
 
 	intel_engines_sanitize(i915, false);
 
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 4667cc08c416..84550f17d3df 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -13,7 +13,7 @@ void i915_timeline_init(struct drm_i915_private *i915,
 			struct i915_timeline *timeline,
 			const char *name)
 {
-	lockdep_assert_held(&i915->drm.struct_mutex);
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
 
 	/*
 	 * Ideally we want a set of engines on a single leaf as we expect
@@ -23,9 +23,12 @@ void i915_timeline_init(struct drm_i915_private *i915,
 	 */
 	BUILD_BUG_ON(KSYNCMAP < I915_NUM_ENGINES);
 
+	timeline->i915 = i915;
 	timeline->name = name;
 
-	list_add(&timeline->link, &i915->gt.timelines);
+	mutex_lock(&gt->mutex);
+	list_add(&timeline->link, &gt->list);
+	mutex_unlock(&gt->mutex);
 
 	/* Called during early_init before we know how many engines there are */
 
@@ -39,6 +42,17 @@ void i915_timeline_init(struct drm_i915_private *i915,
 	i915_syncmap_init(&timeline->sync);
 }
 
+void i915_timelines_init(struct drm_i915_private *i915)
+{
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
+
+	mutex_init(&gt->mutex);
+	INIT_LIST_HEAD(&gt->list);
+
+	/* via i915_gem_wait_for_idle() */
+	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
+}
+
 /**
  * i915_timelines_park - called when the driver idles
  * @i915: the drm_i915_private device
@@ -51,11 +65,11 @@ void i915_timeline_init(struct drm_i915_private *i915,
  */
 void i915_timelines_park(struct drm_i915_private *i915)
 {
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
 	struct i915_timeline *timeline;
 
-	lockdep_assert_held(&i915->drm.struct_mutex);
-
-	list_for_each_entry(timeline, &i915->gt.timelines, link) {
+	mutex_lock(&gt->mutex);
+	list_for_each_entry(timeline, &gt->list, link) {
 		/*
 		 * All known fences are completed so we can scrap
 		 * the current sync point tracking and start afresh,
@@ -64,15 +78,20 @@ void i915_timelines_park(struct drm_i915_private *i915)
 		 */
 		i915_syncmap_free(&timeline->sync);
 	}
+	mutex_unlock(&gt->mutex);
 }
 
 void i915_timeline_fini(struct i915_timeline *timeline)
 {
+	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
+
 	GEM_BUG_ON(!list_empty(&timeline->requests));
 
 	i915_syncmap_free(&timeline->sync);
 
+	mutex_lock(&gt->mutex);
 	list_del(&timeline->link);
+	mutex_unlock(&gt->mutex);
 }
 
 struct i915_timeline *
@@ -99,6 +118,15 @@ void __i915_timeline_free(struct kref *kref)
 	kfree(timeline);
 }
 
+void i915_timelines_fini(struct drm_i915_private *i915)
+{
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
+
+	GEM_BUG_ON(!list_empty(&gt->list));
+
+	mutex_destroy(&gt->mutex);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/mock_timeline.c"
 #include "selftests/i915_timeline.c"
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 38c1e15e927a..87ad2dd31c20 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -66,6 +66,7 @@ struct i915_timeline {
 
 	struct list_head link;
 	const char *name;
+	struct drm_i915_private *i915;
 
 	struct kref kref;
 };
@@ -134,6 +135,8 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
 	return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno);
 }
 
+void i915_timelines_init(struct drm_i915_private *i915);
 void i915_timelines_park(struct drm_i915_private *i915);
+void i915_timelines_fini(struct drm_i915_private *i915);
 
 #endif
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 888c6978bc54..41ae502361d7 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -68,13 +68,14 @@ static void mock_device_release(struct drm_device *dev)
 	i915_gem_contexts_fini(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 
+	i915_timelines_fini(i915);
+
 	drain_workqueue(i915->wq);
 	i915_gem_drain_freed_objects(i915);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	mock_fini_ggtt(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
-	WARN_ON(!list_empty(&i915->gt.timelines));
 
 	destroy_workqueue(i915->wq);
 
@@ -226,7 +227,8 @@ struct drm_i915_private *mock_gem_device(void)
 	if (!i915->priorities)
 		goto err_dependencies;
 
-	INIT_LIST_HEAD(&i915->gt.timelines);
+	i915_timelines_init(i915);
+
 	INIT_LIST_HEAD(&i915->gt.active_rings);
 	INIT_LIST_HEAD(&i915->gt.closed_vma);
 
@@ -253,6 +255,7 @@ struct drm_i915_private *mock_gem_device(void)
 	i915_gem_contexts_fini(i915);
 err_unlock:
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_timelines_fini(i915);
 	kmem_cache_destroy(i915->priorities);
 err_dependencies:
 	kmem_cache_destroy(i915->dependencies);
diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
index dcf3b16f5a07..cf39ccd9fc05 100644
--- a/drivers/gpu/drm/i915/selftests/mock_timeline.c
+++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
@@ -10,6 +10,7 @@
 
 void mock_timeline_init(struct i915_timeline *timeline, u64 context)
 {
+	timeline->i915 = NULL;
 	timeline->fence_context = context;
 
 	spin_lock_init(&timeline->lock);
@@ -24,5 +25,5 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context)
 
 void mock_timeline_fini(struct i915_timeline *timeline)
 {
-	i915_timeline_fini(timeline);
+	i915_syncmap_free(&timeline->sync);
 }
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 13/38] drm/i915: Introduce concept of per-timeline (context) HWSP
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (11 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 12/38] drm/i915: Move list of timelines under its own lock Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 14/38] drm/i915: Enlarge vma->pin_count Chris Wilson
                   ` (26 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Supplement the per-engine HWSP with a per-timeline HWSP. That is a
per-request pointer through which we can check a local seqno,
abstracting away the presumption of a global seqno. In this first step,
we point each request back into the engine's HWSP so everything
continues to work with the global timeline.

v2: s/i915_request_hwsp/hwsp_seqno/ to emphasise that this is the current
HW value and that we are accessing it via i915_request merely as a
convenience.
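
Worth spelling out is the breadcrumb comparison this relies on: seqnos are
compared with wraparound-safe signed arithmetic, so "passed" keeps working
across the u32 boundary. A standalone check mirroring the i915_seqno_passed()
used in the diff:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe "seq1 is at or after seq2", as used for HWSP breadcrumbs. */
static int seqno_passed(uint32_t seq1, uint32_t seq2)
{
        return (int32_t)(seq1 - seq2) >= 0;
}

int main(void)
{
        assert(seqno_passed(2, 1));             /* simple ordering */
        assert(!seqno_passed(1, 2));
        assert(seqno_passed(5, 0xfffffffeu));   /* across the wrap */
        assert(!seqno_passed(0xfffffffeu, 5));
        assert(seqno_passed(7, 7));             /* equal counts as passed */
        printf("seqno comparisons behave as expected\n");
        return 0;
}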

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_request.c | 16 +++++++++-----
 drivers/gpu/drm/i915/i915_request.h | 34 +++++++++++++++++++++++------
 drivers/gpu/drm/i915/intel_lrc.c    |  9 +++++---
 3 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index fb723ed2f574..7d068c406a49 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -182,10 +182,11 @@ static void free_capture_list(struct i915_request *request)
 static void __retire_engine_request(struct intel_engine_cs *engine,
 				    struct i915_request *rq)
 {
-	GEM_TRACE("%s(%s) fence %llx:%lld, global=%d, current %d\n",
+	GEM_TRACE("%s(%s) fence %llx:%lld, global=%d, current %d:%d\n",
 		  __func__, engine->name,
 		  rq->fence.context, rq->fence.seqno,
 		  rq->global_seqno,
+		  hwsp_seqno(rq),
 		  intel_engine_get_seqno(engine));
 
 	GEM_BUG_ON(!i915_request_completed(rq));
@@ -244,10 +245,11 @@ static void i915_request_retire(struct i915_request *request)
 {
 	struct i915_gem_active *active, *next;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d\n",
+	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
 		  request->engine->name,
 		  request->fence.context, request->fence.seqno,
 		  request->global_seqno,
+		  hwsp_seqno(request),
 		  intel_engine_get_seqno(request->engine));
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
@@ -307,10 +309,11 @@ void i915_request_retire_upto(struct i915_request *rq)
 	struct intel_ring *ring = rq->ring;
 	struct i915_request *tmp;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d\n",
+	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
 		  rq->engine->name,
 		  rq->fence.context, rq->fence.seqno,
 		  rq->global_seqno,
+		  hwsp_seqno(rq),
 		  intel_engine_get_seqno(rq->engine));
 
 	lockdep_assert_held(&rq->i915->drm.struct_mutex);
@@ -348,10 +351,11 @@ void __i915_request_submit(struct i915_request *request)
 	struct intel_engine_cs *engine = request->engine;
 	u32 seqno;
 
-	GEM_TRACE("%s fence %llx:%lld -> global=%d, current %d\n",
+	GEM_TRACE("%s fence %llx:%lld -> global=%d, current %d:%d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
 		  engine->timeline.seqno + 1,
+		  hwsp_seqno(request),
 		  intel_engine_get_seqno(engine));
 
 	GEM_BUG_ON(!irqs_disabled());
@@ -398,10 +402,11 @@ void __i915_request_unsubmit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 
-	GEM_TRACE("%s fence %llx:%lld <- global=%d, current %d\n",
+	GEM_TRACE("%s fence %llx:%lld <- global=%d, current %d:%d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
 		  request->global_seqno,
+		  hwsp_seqno(request),
 		  intel_engine_get_seqno(engine));
 
 	GEM_BUG_ON(!irqs_disabled());
@@ -609,6 +614,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->ring = ce->ring;
 	rq->timeline = ce->ring->timeline;
 	GEM_BUG_ON(rq->timeline == &engine->timeline);
+	rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX];
 
 	spin_lock_init(&rq->lock);
 	dma_fence_init(&rq->fence,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index d014b0605445..4dd22dadf5ce 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -130,6 +130,13 @@ struct i915_request {
 	struct i915_sched_node sched;
 	struct i915_dependency dep;
 
+	/*
+	 * A convenience pointer to the current breadcrumb value stored in
+	 * the HW status page (or our timeline's local equivalent). The full
+	 * path would be rq->hw_context->ring->timeline->hwsp_seqno.
+	 */
+	const u32 *hwsp_seqno;
+
 	/**
 	 * GEM sequence number associated with this request on the
 	 * global execution timeline. It is zero when the request is not
@@ -280,11 +287,6 @@ long i915_request_wait(struct i915_request *rq,
 #define I915_WAIT_ALL		BIT(3) /* used by i915_gem_object_wait() */
 #define I915_WAIT_FOR_IDLE_BOOST BIT(4)
 
-static inline bool intel_engine_has_started(struct intel_engine_cs *engine,
-					    u32 seqno);
-static inline bool intel_engine_has_completed(struct intel_engine_cs *engine,
-					      u32 seqno);
-
 /**
  * Returns true if seq1 is later than seq2.
  */
@@ -293,6 +295,24 @@ static inline bool i915_seqno_passed(u32 seq1, u32 seq2)
 	return (s32)(seq1 - seq2) >= 0;
 }
 
+/**
+ * hwsp_seqno - the current breadcrumb value in the HW status page
+ * @rq: the request, to chase the relevant HW status page
+ *
+ * The emphasis in naming here is that hwsp_seqno() is not a property of the
+ * request, but an indication of the current HW state (associated with this
+ * request). Its value will change as the GPU executes more requests.
+ *
+ * Returns the current breadcrumb value in the associated HW status page (or
+ * the local timeline's equivalent) for this request. The request itself
+ * has the associated breadcrumb value of rq->fence.seqno, when the HW
+ * status page has that breadcrumb or later, this request is complete.
+ */
+static inline u32 hwsp_seqno(const struct i915_request *rq)
+{
+	return READ_ONCE(*rq->hwsp_seqno);
+}
+
 /**
  * i915_request_started - check if the request has begun being executed
  * @rq: the request
@@ -310,14 +330,14 @@ static inline bool i915_request_started(const struct i915_request *rq)
 	if (!seqno) /* not yet submitted to HW */
 		return false;
 
-	return intel_engine_has_started(rq->engine, seqno);
+	return i915_seqno_passed(hwsp_seqno(rq), seqno - 1);
 }
 
 static inline bool
 __i915_request_completed(const struct i915_request *rq, u32 seqno)
 {
 	GEM_BUG_ON(!seqno);
-	return intel_engine_has_completed(rq->engine, seqno) &&
+	return i915_seqno_passed(hwsp_seqno(rq), seqno) &&
 		seqno == i915_request_global_seqno(rq);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b2d4abe7b601..dabebb68759f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -446,11 +446,12 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-			GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%lld) (current %d), prio=%d\n",
+			GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%lld) (current %d:%d), prio=%d\n",
 				  engine->name, n,
 				  port[n].context_id, count,
 				  rq->global_seqno,
 				  rq->fence.context, rq->fence.seqno,
+				  hwsp_seqno(rq),
 				  intel_engine_get_seqno(engine),
 				  rq_prio(rq));
 		} else {
@@ -747,11 +748,12 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 	while (num_ports-- && port_isset(port)) {
 		struct i915_request *rq = port_request(port);
 
-		GEM_TRACE("%s:port%u global=%d (fence %llx:%lld), (current %d)\n",
+		GEM_TRACE("%s:port%u global=%d (fence %llx:%lld), (current %d:%d)\n",
 			  rq->engine->name,
 			  (unsigned int)(port - execlists->port),
 			  rq->global_seqno,
 			  rq->fence.context, rq->fence.seqno,
+			  hwsp_seqno(rq),
 			  intel_engine_get_seqno(rq->engine));
 
 		GEM_BUG_ON(!execlists->active);
@@ -975,12 +977,13 @@ static void process_csb(struct intel_engine_cs *engine)
 						EXECLISTS_ACTIVE_USER));
 
 		rq = port_unpack(port, &count);
-		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%lld) (current %d), prio=%d\n",
+		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%lld) (current %d:%d), prio=%d\n",
 			  engine->name,
 			  port->context_id, count,
 			  rq ? rq->global_seqno : 0,
 			  rq ? rq->fence.context : 0,
 			  rq ? rq->fence.seqno : 0,
+			  rq ? hwsp_seqno(rq) : 0,
 			  intel_engine_get_seqno(engine),
 			  rq ? rq_prio(rq) : 0);
 
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 14/38] drm/i915: Enlarge vma->pin_count
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (12 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 13/38] drm/i915: Introduce concept of per-timeline (context) HWSP Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 15/38] drm/i915: Allocate a status page for each timeline Chris Wilson
                   ` (25 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Previously we only accommodated having a vma pinned by a small number of
users, with the most pins held at any one time by the display engine. As
such, we used a small bitfield only large enough to allow the vma to
be pinned twice (for back/front buffers) in each scanout plane. Keeping
the maximum permissible pin_count small allows us to quickly catch a
potential leak. However, as we want to split a 4096B page into 64
different cachelines and pin each cacheline for use by a different
timeline, we will exceed the current maximum permissible vma->pin_count
and so time has come to enlarge it.

Whilst we are here, try to pull together the similar bits:

Address/layout specification:
 - bias, mappable, zone_4g: address limit specifiers
 - fixed: address override, limits still apply though
 - high: not strictly an address limit, but an address direction to search

Search controls:
 - nonblock, nonfault, noevict

v2: Rewrite the guideline comment on bit consumption.
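
Since the count shares the flags word, a single increment both bumps the pin
count and, via a reserved sentinel bit, exposes overflow. A userspace sketch
of that layout (the constants mirror the 8-bit count chosen here, but this is
an illustration, not the driver code):

#include <stdio.h>

#define VMA_PIN_MASK     0xffu          /* 8 bits: up to 255 concurrent pins */
#define VMA_PIN_OVERFLOW (1u << 8)      /* must-be-zero bit, set on wrap */

struct vma { unsigned int flags; };

/* Returns 0 on success, -1 if the pin count would spill into bit 8. */
static int vma_pin(struct vma *vma)
{
        vma->flags++;
        if (vma->flags & VMA_PIN_OVERFLOW) {
                vma->flags--;
                return -1;
        }
        return 0;
}

static void vma_unpin(struct vma *vma)
{
        if (vma->flags & VMA_PIN_MASK)
                vma->flags--;
}

int main(void)
{
        struct vma v = { 0 };
        int i, pins = 0;

        /* Worst-case display estimate from above: 3 * 4 * 7 = 84 pins. */
        for (i = 0; i < 84; i++)
                if (vma_pin(&v) == 0)
                        pins++;

        printf("held %d pins, count field = %u\n", pins, v.flags & VMA_PIN_MASK);
        while (pins--)
                vma_unpin(&v);
        return 0;
}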

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: John Harrison <john.C.Harrison@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_gtt.h | 26 ++++++++---------
 drivers/gpu/drm/i915/i915_vma.h     | 45 +++++++++++++++++++----------
 2 files changed, 42 insertions(+), 29 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index bd679c8c56dd..03ade71b8d9a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -642,19 +642,19 @@ int i915_gem_gtt_insert(struct i915_address_space *vm,
 
 /* Flags used by pin/bind&friends. */
 #define PIN_NONBLOCK		BIT_ULL(0)
-#define PIN_MAPPABLE		BIT_ULL(1)
-#define PIN_ZONE_4G		BIT_ULL(2)
-#define PIN_NONFAULT		BIT_ULL(3)
-#define PIN_NOEVICT		BIT_ULL(4)
-
-#define PIN_MBZ			BIT_ULL(5) /* I915_VMA_PIN_OVERFLOW */
-#define PIN_GLOBAL		BIT_ULL(6) /* I915_VMA_GLOBAL_BIND */
-#define PIN_USER		BIT_ULL(7) /* I915_VMA_LOCAL_BIND */
-#define PIN_UPDATE		BIT_ULL(8)
-
-#define PIN_HIGH		BIT_ULL(9)
-#define PIN_OFFSET_BIAS		BIT_ULL(10)
-#define PIN_OFFSET_FIXED	BIT_ULL(11)
+#define PIN_NONFAULT		BIT_ULL(1)
+#define PIN_NOEVICT		BIT_ULL(2)
+#define PIN_MAPPABLE		BIT_ULL(3)
+#define PIN_ZONE_4G		BIT_ULL(4)
+#define PIN_HIGH		BIT_ULL(5)
+#define PIN_OFFSET_BIAS		BIT_ULL(6)
+#define PIN_OFFSET_FIXED	BIT_ULL(7)
+
+#define PIN_MBZ			BIT_ULL(8) /* I915_VMA_PIN_OVERFLOW */
+#define PIN_GLOBAL		BIT_ULL(9) /* I915_VMA_GLOBAL_BIND */
+#define PIN_USER		BIT_ULL(10) /* I915_VMA_LOCAL_BIND */
+#define PIN_UPDATE		BIT_ULL(11)
+
 #define PIN_OFFSET_MASK		(-I915_GTT_PAGE_SIZE)
 
 #endif
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 7252abc73d3e..5793abe509a2 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -71,29 +71,42 @@ struct i915_vma {
 	unsigned int open_count;
 	unsigned long flags;
 	/**
-	 * How many users have pinned this object in GTT space. The following
-	 * users can each hold at most one reference: pwrite/pread, execbuffer
-	 * (objects are not allowed multiple times for the same batchbuffer),
-	 * and the framebuffer code. When switching/pageflipping, the
-	 * framebuffer code has at most two buffers pinned per crtc.
+	 * How many users have pinned this object in GTT space.
 	 *
-	 * In the worst case this is 1 + 1 + 1 + 2*2 = 7. That would fit into 3
-	 * bits with absolutely no headroom. So use 4 bits.
+	 * This is a tightly bound, fairly small number of users, so we
+	 * stuff inside the flags field so that we can both check for overflow
+	 * and detect a no-op i915_vma_pin() in a single check, while also
+	 * pinning the vma.
+	 *
+	 * The worst case display setup would have the same vma pinned for
+	 * use on each plane on each crtc, while also building the next atomic
+	 * state and holding a pin for the length of the cleanup queue. In the
+	 * future, the flip queue may be increased from 1.
+	 * Estimated worst case: 3 [qlen] * 4 [max crtcs] * 7 [max planes] = 84
+	 *
+	 * For GEM, the number of concurrent users for pwrite/pread is
+	 * unbounded. For execbuffer, it is currently one but will in future
+	 * be extended to allow multiple clients to pin vma concurrently.
+	 *
+	 * We also use suballocated pages, with each suballocation claiming
+	 * its own pin on the shared vma. At present, this is limited to
+	 * exclusive cachelines of a single page, so a maximum of 64 possible
+	 * users.
 	 */
-#define I915_VMA_PIN_MASK 0xf
-#define I915_VMA_PIN_OVERFLOW	BIT(5)
+#define I915_VMA_PIN_MASK 0xff
+#define I915_VMA_PIN_OVERFLOW	BIT(8)
 
 	/** Flags and address space this VMA is bound to */
-#define I915_VMA_GLOBAL_BIND	BIT(6)
-#define I915_VMA_LOCAL_BIND	BIT(7)
+#define I915_VMA_GLOBAL_BIND	BIT(9)
+#define I915_VMA_LOCAL_BIND	BIT(10)
 #define I915_VMA_BIND_MASK (I915_VMA_GLOBAL_BIND | I915_VMA_LOCAL_BIND | I915_VMA_PIN_OVERFLOW)
 
-#define I915_VMA_GGTT		BIT(8)
-#define I915_VMA_CAN_FENCE	BIT(9)
-#define I915_VMA_CLOSED		BIT(10)
-#define I915_VMA_USERFAULT_BIT	11
+#define I915_VMA_GGTT		BIT(11)
+#define I915_VMA_CAN_FENCE	BIT(12)
+#define I915_VMA_CLOSED		BIT(13)
+#define I915_VMA_USERFAULT_BIT	14
 #define I915_VMA_USERFAULT	BIT(I915_VMA_USERFAULT_BIT)
-#define I915_VMA_GGTT_WRITE	BIT(12)
+#define I915_VMA_GGTT_WRITE	BIT(15)
 
 	unsigned int active_count;
 	struct rb_root active;
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 15/38] drm/i915: Allocate a status page for each timeline
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (13 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 14/38] drm/i915: Enlarge vma->pin_count Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-21 11:18   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 16/38] drm/i915: Share per-timeline HWSP using a slab suballocator Chris Wilson
                   ` (24 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Allocate a page for use as a status page by a group of timelines. As we
only need a dword of storage for each (rounded up to a cacheline for
safety), we can pack multiple timelines into the same page. Each timeline
will then be able to track its own HW seqno.
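
The addressing used to reach a timeline's breadcrumb (as in the selftests
below) is then simple base-plus-offset arithmetic; a tiny standalone
illustration, with invented names and addresses:

#include <stdint.h>
#include <stdio.h>

/* Each timeline records which status page backs it and where within it. */
struct timeline {
        uint32_t hwsp_ggtt;   /* GGTT address of the backing page */
        uint32_t hwsp_offset; /* byte offset of this timeline's seqno dword */
};

static uint32_t seqno_address(const struct timeline *tl)
{
        return tl->hwsp_ggtt + tl->hwsp_offset;
}

int main(void)
{
        /* A timeline with its own page keeps the seqno at offset 0 ... */
        struct timeline own = { .hwsp_ggtt = 0x10000, .hwsp_offset = 0 };
        /* ... one sharing the engine's page uses a reserved slot instead. */
        struct timeline shared = { .hwsp_ggtt = 0x20000, .hwsp_offset = 0x100 };

        printf("own=0x%x shared=0x%x\n",
               seqno_address(&own), seqno_address(&shared));
        return 0;
}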

v2: Reuse the common per-engine HWSP for the solitary ringbuffer
timeline, so that we do not have to emit (using per-gen specialised
vfuncs) the breadcrumb into the distinct timeline HWSP and instead can
keep on using the common MI_STORE_DWORD_INDEX. However, to maintain the
sleight-of-hand for the global/per-context seqno switchover, we will
store both temporarily (and so use a custom offset for the shared timeline
HWSP until the switchover).

v3: Keep things simple and allocate a page for each timeline, page
sharing comes next.

v4: I was caught repeating the same MI_STORE_DWORD_IMM over and over
again in selftests.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_timeline.c          | 106 +++++-
 drivers/gpu/drm/i915/i915_timeline.h          |  21 +-
 drivers/gpu/drm/i915/intel_engine_cs.c        |  64 ++--
 drivers/gpu/drm/i915/intel_lrc.c              |  22 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c       |  10 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   6 +-
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 .../drm/i915/selftests/i915_mock_selftests.h  |   2 +-
 .../gpu/drm/i915/selftests/i915_timeline.c    | 314 +++++++++++++++++-
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  17 +-
 10 files changed, 512 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 84550f17d3df..a7d902e9eaf1 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -9,11 +9,38 @@
 #include "i915_timeline.h"
 #include "i915_syncmap.h"
 
-void i915_timeline_init(struct drm_i915_private *i915,
-			struct i915_timeline *timeline,
-			const char *name)
+static int hwsp_alloc(struct i915_timeline *timeline)
+{
+	struct drm_i915_private *i915 = timeline->i915;
+	struct drm_i915_gem_object *obj;
+	struct i915_vma *vma;
+
+	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+	if (IS_ERR(obj))
+		return PTR_ERR(obj);
+
+	i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
+
+	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
+	if (IS_ERR(vma)) {
+		i915_gem_object_put(obj);
+		return PTR_ERR(vma);
+	}
+
+	timeline->hwsp_ggtt = vma;
+	timeline->hwsp_offset = 0;
+
+	return 0;
+}
+
+int i915_timeline_init(struct drm_i915_private *i915,
+		       struct i915_timeline *timeline,
+		       const char *name,
+		       struct i915_vma *global_hwsp)
 {
 	struct i915_gt_timelines *gt = &i915->gt.timelines;
+	void *vaddr;
+	int err;
 
 	/*
 	 * Ideally we want a set of engines on a single leaf as we expect
@@ -25,10 +52,27 @@ void i915_timeline_init(struct drm_i915_private *i915,
 
 	timeline->i915 = i915;
 	timeline->name = name;
+	timeline->pin_count = 0;
+
+	if (global_hwsp) {
+		timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
+		timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
+	} else {
+		err = hwsp_alloc(timeline);
+		if (err)
+			return err;
+	}
 
-	mutex_lock(&gt->mutex);
-	list_add(&timeline->link, &gt->list);
-	mutex_unlock(&gt->mutex);
+	vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
+	if (IS_ERR(vaddr)) {
+		i915_vma_put(timeline->hwsp_ggtt);
+		return PTR_ERR(vaddr);
+	}
+
+	timeline->hwsp_seqno =
+		memset(vaddr + timeline->hwsp_offset,
+		       0,
+		       sizeof(*timeline->hwsp_seqno));
 
 	/* Called during early_init before we know how many engines there are */
 
@@ -40,6 +84,12 @@ void i915_timeline_init(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&timeline->requests);
 
 	i915_syncmap_init(&timeline->sync);
+
+	mutex_lock(&gt->mutex);
+	list_add(&timeline->link, &gt->list);
+	mutex_unlock(&gt->mutex);
+
+	return 0;
 }
 
 void i915_timelines_init(struct drm_i915_private *i915)
@@ -85,6 +135,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 {
 	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
 
+	GEM_BUG_ON(timeline->pin_count);
 	GEM_BUG_ON(!list_empty(&timeline->requests));
 
 	i915_syncmap_free(&timeline->sync);
@@ -92,23 +143,62 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 	mutex_lock(&gt->mutex);
 	list_del(&timeline->link);
 	mutex_unlock(&gt->mutex);
+
+	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
+	i915_vma_put(timeline->hwsp_ggtt);
 }
 
 struct i915_timeline *
-i915_timeline_create(struct drm_i915_private *i915, const char *name)
+i915_timeline_create(struct drm_i915_private *i915,
+		     const char *name,
+		     struct i915_vma *global_hwsp)
 {
 	struct i915_timeline *timeline;
+	int err;
 
 	timeline = kzalloc(sizeof(*timeline), GFP_KERNEL);
 	if (!timeline)
 		return ERR_PTR(-ENOMEM);
 
-	i915_timeline_init(i915, timeline, name);
+	err = i915_timeline_init(i915, timeline, name, global_hwsp);
+	if (err) {
+		kfree(timeline);
+		return ERR_PTR(err);
+	}
+
 	kref_init(&timeline->kref);
 
 	return timeline;
 }
 
+int i915_timeline_pin(struct i915_timeline *tl)
+{
+	int err;
+
+	if (tl->pin_count++)
+		return 0;
+	GEM_BUG_ON(!tl->pin_count);
+
+	err = i915_vma_pin(tl->hwsp_ggtt, 0, 0, PIN_GLOBAL | PIN_HIGH);
+	if (err)
+		goto unpin;
+
+	return 0;
+
+unpin:
+	tl->pin_count = 0;
+	return err;
+}
+
+void i915_timeline_unpin(struct i915_timeline *tl)
+{
+	GEM_BUG_ON(!tl->pin_count);
+	if (--tl->pin_count)
+		return;
+
+	__i915_vma_unpin(tl->hwsp_ggtt);
+}
+
 void __i915_timeline_free(struct kref *kref)
 {
 	struct i915_timeline *timeline =
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 87ad2dd31c20..0c3739d53d79 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -32,6 +32,8 @@
 #include "i915_syncmap.h"
 #include "i915_utils.h"
 
+struct i915_vma;
+
 struct i915_timeline {
 	u64 fence_context;
 	u32 seqno;
@@ -40,6 +42,11 @@ struct i915_timeline {
 #define TIMELINE_CLIENT 0 /* default subclass */
 #define TIMELINE_ENGINE 1
 
+	unsigned int pin_count;
+	const u32 *hwsp_seqno;
+	struct i915_vma *hwsp_ggtt;
+	u32 hwsp_offset;
+
 	/**
 	 * List of breadcrumbs associated with GPU requests currently
 	 * outstanding.
@@ -71,9 +78,10 @@ struct i915_timeline {
 	struct kref kref;
 };
 
-void i915_timeline_init(struct drm_i915_private *i915,
-			struct i915_timeline *tl,
-			const char *name);
+int i915_timeline_init(struct drm_i915_private *i915,
+		       struct i915_timeline *tl,
+		       const char *name,
+		       struct i915_vma *hwsp);
 void i915_timeline_fini(struct i915_timeline *tl);
 
 static inline void
@@ -96,7 +104,9 @@ i915_timeline_set_subclass(struct i915_timeline *timeline,
 }
 
 struct i915_timeline *
-i915_timeline_create(struct drm_i915_private *i915, const char *name);
+i915_timeline_create(struct drm_i915_private *i915,
+		     const char *name,
+		     struct i915_vma *global_hwsp);
 
 static inline struct i915_timeline *
 i915_timeline_get(struct i915_timeline *timeline)
@@ -135,6 +145,9 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
 	return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno);
 }
 
+int i915_timeline_pin(struct i915_timeline *tl);
+void i915_timeline_unpin(struct i915_timeline *tl);
+
 void i915_timelines_init(struct drm_i915_private *i915);
 void i915_timelines_park(struct drm_i915_private *i915);
 void i915_timelines_fini(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 4b4b7358c482..c850d131d8c3 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -484,26 +484,6 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
 	execlists->queue = RB_ROOT_CACHED;
 }
 
-/**
- * intel_engines_setup_common - setup engine state not requiring hw access
- * @engine: Engine to setup.
- *
- * Initializes @engine@ structure members shared between legacy and execlists
- * submission modes which do not require hardware access.
- *
- * Typically done early in the submission mode specific engine setup stage.
- */
-void intel_engine_setup_common(struct intel_engine_cs *engine)
-{
-	i915_timeline_init(engine->i915, &engine->timeline, engine->name);
-	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
-
-	intel_engine_init_execlist(engine);
-	intel_engine_init_hangcheck(engine);
-	intel_engine_init_batch_pool(engine);
-	intel_engine_init_cmd_parser(engine);
-}
-
 static void cleanup_status_page(struct intel_engine_cs *engine)
 {
 	struct i915_vma *vma;
@@ -601,6 +581,44 @@ static int init_status_page(struct intel_engine_cs *engine)
 	return ret;
 }
 
+/**
+ * intel_engines_setup_common - setup engine state not requiring hw access
+ * @engine: Engine to setup.
+ *
+ * Initializes @engine@ structure members shared between legacy and execlists
+ * submission modes which do not require hardware access.
+ *
+ * Typically done early in the submission mode specific engine setup stage.
+ */
+int intel_engine_setup_common(struct intel_engine_cs *engine)
+{
+	int err;
+
+	err = init_status_page(engine);
+	if (err)
+		return err;
+
+	err = i915_timeline_init(engine->i915,
+				 &engine->timeline,
+				 engine->name,
+				 engine->status_page.vma);
+	if (err)
+		goto err_hwsp;
+
+	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
+
+	intel_engine_init_execlist(engine);
+	intel_engine_init_hangcheck(engine);
+	intel_engine_init_batch_pool(engine);
+	intel_engine_init_cmd_parser(engine);
+
+	return 0;
+
+err_hwsp:
+	cleanup_status_page(engine);
+	return err;
+}
+
 static void __intel_context_unpin(struct i915_gem_context *ctx,
 				  struct intel_engine_cs *engine)
 {
@@ -653,14 +671,8 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 	if (ret)
 		goto err_unpin_preempt;
 
-	ret = init_status_page(engine);
-	if (ret)
-		goto err_breadcrumbs;
-
 	return 0;
 
-err_breadcrumbs:
-	intel_engine_fini_breadcrumbs(engine);
 err_unpin_preempt:
 	if (i915->preempt_context)
 		__intel_context_unpin(i915->preempt_context, engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dabebb68759f..ba59241add47 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2199,10 +2199,14 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
 	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
 }
 
-static void
+static int
 logical_ring_setup(struct intel_engine_cs *engine)
 {
-	intel_engine_setup_common(engine);
+	int err;
+
+	err = intel_engine_setup_common(engine);
+	if (err)
+		return err;
 
 	/* Intentionally left blank. */
 	engine->buffer = NULL;
@@ -2212,6 +2216,8 @@ logical_ring_setup(struct intel_engine_cs *engine)
 
 	logical_ring_default_vfuncs(engine);
 	logical_ring_default_irqs(engine);
+
+	return 0;
 }
 
 static int logical_ring_init(struct intel_engine_cs *engine)
@@ -2260,7 +2266,9 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
 {
 	int ret;
 
-	logical_ring_setup(engine);
+	ret = logical_ring_setup(engine);
+	if (ret)
+		return ret;
 
 	/* Override some for render ring. */
 	engine->init_context = gen8_init_rcs_context;
@@ -2290,7 +2298,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
 
 int logical_xcs_ring_init(struct intel_engine_cs *engine)
 {
-	logical_ring_setup(engine);
+	int err;
+
+	err = logical_ring_setup(engine);
+	if (err)
+		return err;
 
 	return logical_ring_init(engine);
 }
@@ -2624,7 +2636,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 		goto error_deref_obj;
 	}
 
-	timeline = i915_timeline_create(ctx->i915, ctx->name);
+	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
 	if (IS_ERR(timeline)) {
 		ret = PTR_ERR(timeline);
 		goto error_deref_obj;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index d72012b42f20..5887304bc3ae 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1539,9 +1539,13 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	struct intel_ring *ring;
 	int err;
 
-	intel_engine_setup_common(engine);
+	err = intel_engine_setup_common(engine);
+	if (err)
+		return err;
 
-	timeline = i915_timeline_create(engine->i915, engine->name);
+	timeline = i915_timeline_create(engine->i915,
+					engine->name,
+					engine->status_page.vma);
 	if (IS_ERR(timeline)) {
 		err = PTR_ERR(timeline);
 		goto err;
@@ -1565,6 +1569,8 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 	if (err)
 		goto err_unpin;
 
+	GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
+
 	return 0;
 
 err_unpin:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d1a82610e0c1..de0099ea926b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -720,7 +720,9 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 #define I915_GEM_HWS_PREEMPT_INDEX	0x32
 #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
-#define I915_GEM_HWS_SCRATCH_INDEX	0x40
+#define I915_GEM_HWS_SEQNO		0x40
+#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO << MI_STORE_DWORD_INDEX_SHIFT)
+#define I915_GEM_HWS_SCRATCH_INDEX	0x80
 #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 
 #define I915_HWS_CSB_BUF0_INDEX		0x10
@@ -826,7 +828,7 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
 
-void intel_engine_setup_common(struct intel_engine_cs *engine);
+int intel_engine_setup_common(struct intel_engine_cs *engine);
 int intel_engine_init_common(struct intel_engine_cs *engine);
 void intel_engine_cleanup_common(struct intel_engine_cs *engine);
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index a15713cae3b3..76b4f87fc853 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -13,6 +13,7 @@ selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */
 selftest(uncore, intel_uncore_live_selftests)
 selftest(workarounds, intel_workarounds_live_selftests)
 selftest(requests, i915_request_live_selftests)
+selftest(timelines, i915_timeline_live_selftests)
 selftest(objects, i915_gem_object_live_selftests)
 selftest(dmabuf, i915_gem_dmabuf_live_selftests)
 selftest(coherency, i915_gem_coherency_live_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
index 1b70208eeea7..4a83a1c6c406 100644
--- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
@@ -16,7 +16,7 @@ selftest(syncmap, i915_syncmap_mock_selftests)
 selftest(uncore, intel_uncore_mock_selftests)
 selftest(engine, intel_engine_cs_mock_selftests)
 selftest(breadcrumbs, intel_breadcrumbs_mock_selftests)
-selftest(timelines, i915_gem_timeline_mock_selftests)
+selftest(timelines, i915_timeline_mock_selftests)
 selftest(requests, i915_request_mock_selftests)
 selftest(objects, i915_gem_object_mock_selftests)
 selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_timeline.c b/drivers/gpu/drm/i915/selftests/i915_timeline.c
index 19f1c6a5c8fb..15a33ec7217d 100644
--- a/drivers/gpu/drm/i915/selftests/i915_timeline.c
+++ b/drivers/gpu/drm/i915/selftests/i915_timeline.c
@@ -256,7 +256,7 @@ static int bench_sync(void *arg)
 	return 0;
 }
 
-int i915_gem_timeline_mock_selftests(void)
+int i915_timeline_mock_selftests(void)
 {
 	static const struct i915_subtest tests[] = {
 		SUBTEST(igt_sync),
@@ -265,3 +265,315 @@ int i915_gem_timeline_mock_selftests(void)
 
 	return i915_subtests(tests, NULL);
 }
+
+static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
+{
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	if (INTEL_GEN(rq->i915) >= 8) {
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = addr;
+		*cs++ = 0;
+		*cs++ = value;
+	} else if (INTEL_GEN(rq->i915) >= 4) {
+		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+		*cs++ = 0;
+		*cs++ = addr;
+		*cs++ = value;
+	} else {
+		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+		*cs++ = addr;
+		*cs++ = value;
+		*cs++ = MI_NOOP;
+	}
+
+	intel_ring_advance(rq, cs);
+
+	return 0;
+}
+
+static u32 hwsp_address(const struct i915_timeline *tl)
+{
+	return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset;
+}
+
+static struct i915_request *
+tl_write(struct i915_timeline *tl, struct intel_engine_cs *engine, u32 value)
+{
+	struct i915_request *rq;
+	int err;
+
+	lockdep_assert_held(tl->i915->drm.struct_mutex); /* lazy with rq refs */
+
+	err = i915_timeline_pin(tl);
+	if (err)
+		return ERR_PTR(err);
+
+	rq = i915_request_alloc(engine, engine->i915->kernel_context);
+	if (IS_ERR(rq))
+		goto out_unpin;
+
+	err = emit_ggtt_store_dw(rq, hwsp_address(tl), value);
+	i915_request_add(rq);
+	if (err)
+		rq = ERR_PTR(err);
+
+out_unpin:
+	i915_timeline_unpin(tl);
+	return rq;
+}
+
+static int live_hwsp_engine(void *arg)
+{
+#define NUM_TIMELINES 4096
+	struct drm_i915_private *i915 = arg;
+	struct i915_timeline **timelines;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	unsigned long count, n;
+	int err = 0;
+
+	/*
+	 * Create a bunch of timelines and check we can write
+	 * independently to each of their breadcrumb slots.
+	 */
+
+	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
+				   sizeof(*timelines),
+				   GFP_KERNEL);
+	if (!timelines)
+		return -ENOMEM;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	count = 0;
+	for_each_engine(engine, i915, id) {
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		for (n = 0; n < NUM_TIMELINES; n++) {
+			struct i915_timeline *tl;
+			struct i915_request *rq;
+
+			tl = i915_timeline_create(i915, "live", NULL);
+			if (IS_ERR(tl)) {
+				err = PTR_ERR(tl);
+				goto out;
+			}
+
+			if (*tl->hwsp_seqno != tl->seqno) {
+				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
+				       count, *tl->hwsp_seqno, tl->seqno);
+				err = -EINVAL;
+				i915_timeline_put(tl);
+				goto out;
+			}
+
+			rq = tl_write(tl, engine, count);
+			if (IS_ERR(rq)) {
+				pr_err("Failed to write to timeline!\n");
+				i915_timeline_put(tl);
+				err = PTR_ERR(rq);
+				goto out;
+			}
+
+			timelines[count++] = tl;
+		}
+	}
+
+	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
+
+out:
+	for (n = 0; n < count; n++) {
+		struct i915_timeline *tl = timelines[n];
+
+		if (!err && *tl->hwsp_seqno != n) {
+			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
+			       n, *tl->hwsp_seqno);
+			err = -EINVAL;
+		}
+		i915_timeline_put(tl);
+	}
+
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	kvfree(timelines);
+
+	return err;
+#undef NUM_TIMELINES
+}
+
+static int live_hwsp_alternate(void *arg)
+{
+#define NUM_TIMELINES 4096
+	struct drm_i915_private *i915 = arg;
+	struct i915_timeline **timelines;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	unsigned long count, n;
+	int err = 0;
+
+	/*
+	 * Create a bunch of timelines and check we can write
+	 * independently to each of their breadcrumb slots with adjacent
+	 * engines.
+	 */
+
+	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
+				   sizeof(*timelines),
+				   GFP_KERNEL);
+	if (!timelines)
+		return -ENOMEM;
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	count = 0;
+	for (n = 0; n < NUM_TIMELINES; n++) {
+		for_each_engine(engine, i915, id) {
+			struct i915_timeline *tl;
+			struct i915_request *rq;
+
+			if (!intel_engine_can_store_dword(engine))
+				continue;
+
+			tl = i915_timeline_create(i915, "live", NULL);
+			if (IS_ERR(tl)) {
+				err = PTR_ERR(tl);
+				goto out;
+			}
+
+			if (*tl->hwsp_seqno != tl->seqno) {
+				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
+				       count, *tl->hwsp_seqno, tl->seqno);
+				err = -EINVAL;
+				i915_timeline_put(tl);
+				goto out;
+			}
+
+			rq = tl_write(tl, engine, count);
+			if (IS_ERR(rq)) {
+				pr_err("Failed to write to timeline!\n");
+				i915_timeline_put(tl);
+				err = PTR_ERR(rq);
+				goto out;
+			}
+
+			timelines[count++] = tl;
+		}
+	}
+
+	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
+
+out:
+	for (n = 0; n < count; n++) {
+		struct i915_timeline *tl = timelines[n];
+
+		if (!err && *tl->hwsp_seqno != n) {
+			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
+			       n, *tl->hwsp_seqno);
+			err = -EINVAL;
+		}
+		i915_timeline_put(tl);
+	}
+
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	kvfree(timelines);
+
+	return err;
+#undef NUM_TIMELINES
+}
+
+static int live_hwsp_recycle(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	unsigned long count;
+	int err = 0;
+
+	/*
+	 * Check seqno writes into one timeline at a time. We expect to
+	 * recycle the breadcrumb slot between iterations and neither
+	 * want to confuse ourselves or the GPU.
+	 */
+
+	mutex_lock(&i915->drm.struct_mutex);
+	wakeref = intel_runtime_pm_get(i915);
+
+	count = 0;
+	for_each_engine(engine, i915, id) {
+		IGT_TIMEOUT(end_time);
+
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		do {
+			struct i915_timeline *tl;
+			struct i915_request *rq;
+
+			tl = i915_timeline_create(i915, "live", NULL);
+			if (IS_ERR(tl)) {
+				err = PTR_ERR(tl);
+				goto out;
+			}
+
+			if (*tl->hwsp_seqno != tl->seqno) {
+				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
+				       count, *tl->hwsp_seqno, tl->seqno);
+				err = -EINVAL;
+				i915_timeline_put(tl);
+				goto out;
+			}
+
+			rq = tl_write(tl, engine, count);
+			if (IS_ERR(rq)) {
+				pr_err("Failed to write to timeline!\n");
+				i915_timeline_put(tl);
+				err = PTR_ERR(rq);
+				goto out;
+			}
+
+			i915_request_wait(rq, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
+			if (*tl->hwsp_seqno != count) {
+				pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
+				       count, *tl->hwsp_seqno);
+				err = -EINVAL;
+			}
+
+			i915_timeline_put(tl);
+			count++;
+
+			if (err)
+				goto out;
+		} while (!__igt_timeout(end_time, NULL));
+	}
+
+out:
+	intel_runtime_pm_put(i915, wakeref);
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	return err;
+}
+
+int i915_timeline_live_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_hwsp_recycle),
+		SUBTEST(live_hwsp_engine),
+		SUBTEST(live_hwsp_alternate),
+	};
+
+	return i915_subtests(tests, i915);
+}
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 968a7e139a67..acd27c7e807b 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -34,12 +34,20 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
 {
 	const unsigned long sz = PAGE_SIZE / 2;
 	struct mock_ring *ring;
+	int err;
 
 	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
 	if (!ring)
 		return NULL;
 
-	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
+	err = i915_timeline_init(engine->i915,
+				 &ring->timeline,
+				 engine->name,
+				 NULL);
+	if (err) {
+		kfree(ring);
+		return NULL;
+	}
 
 	ring->base.size = sz;
 	ring->base.effective_size = sz;
@@ -209,7 +217,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
 	engine->base.submit_request = mock_submit_request;
 
-	i915_timeline_init(i915, &engine->base.timeline, engine->base.name);
+	if (i915_timeline_init(i915,
+			       &engine->base.timeline,
+			       engine->base.name,
+			       NULL))
+		goto err_free;
 	i915_timeline_set_subclass(&engine->base.timeline, TIMELINE_ENGINE);
 
 	intel_engine_init_breadcrumbs(&engine->base);
@@ -227,6 +239,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 err_breadcrumbs:
 	intel_engine_fini_breadcrumbs(&engine->base);
 	i915_timeline_fini(&engine->base.timeline);
+err_free:
 	kfree(engine);
 	return NULL;
 }
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 16/38] drm/i915: Share per-timeline HWSP using a slab suballocator
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (14 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 15/38] drm/i915: Allocate a status page for each timeline Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 17/38] drm/i915: Keep all partially allocated HWSP on a freelist Chris Wilson
                   ` (23 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

If we restrict ourselves to only using a cacheline for each timeline's
HWSP (we could go smaller, but want to avoid needlessly polluting
cachelines on different engines between different contexts), then we can
suballocate a single 4k page into 64 different timeline HWSPs. By
treating each fresh allocation as a slab of 64 entries, we can keep it
around for the next 64 allocation attempts until we need to refresh the
slab cache.
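
The heart of the suballocator is a 64-bit availability mask per page; a
minimal userspace model of the idea (single-threaded, invented names, no
vma management, not the driver code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define CACHELINE_BYTES 64u
#define PAGE_SIZE       4096u
#define SLOTS           (PAGE_SIZE / CACHELINE_BYTES) /* 64 cachelines */

struct hwsp_page {
        uint64_t free; /* bit n set => cacheline n is available */
};

static int hwsp_alloc(struct hwsp_page *p, unsigned int *offset)
{
        unsigned int slot;

        if (!p->free)
                return -1; /* page exhausted; caller grabs a fresh page */

        slot = __builtin_ctzll(p->free);
        p->free &= ~(1ull << slot);
        *offset = slot * CACHELINE_BYTES;
        return 0;
}

static void hwsp_free(struct hwsp_page *p, unsigned int offset)
{
        p->free |= 1ull << (offset / CACHELINE_BYTES);
}

int main(void)
{
        struct hwsp_page page = { .free = ~0ull };
        unsigned int offset;
        unsigned int i;

        for (i = 0; i < SLOTS; i++)
                assert(hwsp_alloc(&page, &offset) == 0);
        assert(hwsp_alloc(&page, &offset) == -1); /* all 64 slots in use */

        hwsp_free(&page, 5 * CACHELINE_BYTES);    /* one timeline retires */
        assert(hwsp_alloc(&page, &offset) == 0);
        assert(offset == 5 * CACHELINE_BYTES);    /* its cacheline is reused */

        printf("reused offset %u\n", offset);
        return 0;
}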

John Harrison noted the issue of fragmentation leading to the same
worst-case performance of one page per timeline as before, which can be
mitigated by adopting a freelist.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  5 ++
 drivers/gpu/drm/i915/i915_timeline.c | 80 ++++++++++++++++++++++++----
 2 files changed, 74 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 3913900600b7..d59228dabb6e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1978,6 +1978,11 @@ struct drm_i915_private {
 		struct i915_gt_timelines {
 			struct mutex mutex; /* protects list, tainted by GPU */
 			struct list_head list;
+
+			/* Pack multiple timelines' seqnos into the same page */
+			spinlock_t hwsp_lock;
+			struct i915_vma *hwsp;
+			u64 bitmap;
 		} timelines;
 
 		struct list_head active_rings;
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index a7d902e9eaf1..1579ace82cfd 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -11,26 +11,73 @@
 
 static int hwsp_alloc(struct i915_timeline *timeline)
 {
+#define NBITS BITS_PER_TYPE(typeof(gt->bitmap))
 	struct drm_i915_private *i915 = timeline->i915;
-	struct drm_i915_gem_object *obj;
+	struct i915_gt_timelines *gt = &i915->gt.timelines;
 	struct i915_vma *vma;
+	int offset;
+
+	spin_lock(&gt->hwsp_lock);
+
+restart:
+	offset = find_first_bit((unsigned long *)&gt->bitmap, NBITS);
+	if (offset == NBITS && gt->hwsp) {
+		i915_vma_put(gt->hwsp);
+		gt->hwsp = NULL;
+	}
+
+	vma = gt->hwsp;
+	if (!vma) {
+		struct drm_i915_gem_object *obj;
+
+		spin_unlock(&gt->hwsp_lock);
 
-	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
-	if (IS_ERR(obj))
-		return PTR_ERR(obj);
+		BUILD_BUG_ON(NBITS * CACHELINE_BYTES > PAGE_SIZE);
+		obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
+		if (IS_ERR(obj))
+			return PTR_ERR(obj);
 
-	i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
+		i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
 
-	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
-	if (IS_ERR(vma)) {
-		i915_gem_object_put(obj);
-		return PTR_ERR(vma);
+		vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
+		if (IS_ERR(vma)) {
+			i915_gem_object_put(obj);
+			return PTR_ERR(vma);
+		}
+
+		spin_lock(&gt->hwsp_lock);
+		if (gt->hwsp) {
+			i915_gem_object_put(obj);
+			goto restart;
+		}
+
+		gt->hwsp = vma;
+		gt->bitmap = ~0ull;
+		offset = 0;
 	}
 
-	timeline->hwsp_ggtt = vma;
-	timeline->hwsp_offset = 0;
+	gt->bitmap &= ~BIT_ULL(offset);
+
+	spin_unlock(&gt->hwsp_lock);
+
+	timeline->hwsp_ggtt = i915_vma_get(vma);
+	timeline->hwsp_offset = offset * CACHELINE_BYTES;
 
 	return 0;
+#undef NBITS
+}
+
+static void hwsp_free(struct i915_timeline *timeline)
+{
+	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
+
+	if (timeline->hwsp_ggtt != gt->hwsp)
+		return;
+
+	spin_lock(&gt->hwsp_lock);
+	if (timeline->hwsp_ggtt == gt->hwsp)
+		gt->bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
+	spin_unlock(&gt->hwsp_lock);
 }
 
 int i915_timeline_init(struct drm_i915_private *i915,
@@ -65,6 +112,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
 
 	vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
 	if (IS_ERR(vaddr)) {
+		hwsp_free(timeline);
 		i915_vma_put(timeline->hwsp_ggtt);
 		return PTR_ERR(vaddr);
 	}
@@ -99,6 +147,8 @@ void i915_timelines_init(struct drm_i915_private *i915)
 	mutex_init(&gt->mutex);
 	INIT_LIST_HEAD(&gt->list);
 
+	spin_lock_init(&gt->hwsp_lock);
+
 	/* via i915_gem_wait_for_idle() */
 	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
 }
@@ -144,6 +194,9 @@ void i915_timeline_fini(struct i915_timeline *timeline)
 	list_del(&timeline->link);
 	mutex_unlock(&gt->mutex);
 
+	i915_syncmap_free(&timeline->sync);
+	hwsp_free(timeline);
+
 	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
 	i915_vma_put(timeline->hwsp_ggtt);
 }
@@ -211,9 +264,14 @@ void __i915_timeline_free(struct kref *kref)
 void i915_timelines_fini(struct drm_i915_private *i915)
 {
 	struct i915_gt_timelines *gt = &i915->gt.timelines;
+	struct i915_vma *vma;
 
 	GEM_BUG_ON(!list_empty(&gt->list));
 
+	vma = fetch_and_zero(&i915->gt.timelines.hwsp);
+	if (vma)
+		i915_vma_put(vma);
+
 	mutex_destroy(&gt->mutex);
 }
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 17/38] drm/i915: Keep all partially allocated HWSP on a freelist
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (15 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 16/38] drm/i915: Share per-timeline HWSP using a slab suballocator Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 18/38] drm/i915: Track the context's seqno in its own timeline HWSP Chris Wilson
                   ` (22 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Keep track of partially allocated pages for use in allocating future
timeline HWSP. This is still without migration, so it is possible for
the system to end up with each timeline in its own page, but we ensure
that no new allocation would needlessly allocate a fresh page!
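
A rough standalone model of the freelist behaviour (single-threaded,
invented names; the driver must additionally handle locking and the vma
lifetime):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHELINE_BYTES 64u

/* One backing page; it sits on the free list while any cacheline is unused. */
struct hwsp {
        struct hwsp *next;
        uint64_t free; /* bit n set => cacheline n is available */
};

static struct hwsp *free_list;

static void list_remove(struct hwsp *h)
{
        struct hwsp **p = &free_list;

        while (*p != h)
                p = &(*p)->next;
        *p = h->next;
}

static struct hwsp *hwsp_alloc(unsigned int *offset)
{
        struct hwsp *h = free_list;
        unsigned int slot;

        if (!h) { /* no partially used page left: create a fresh one */
                h = calloc(1, sizeof(*h));
                if (!h)
                        return NULL;
                h->free = ~0ull;
                free_list = h;
        }

        slot = __builtin_ctzll(h->free);
        h->free &= ~(1ull << slot);
        if (!h->free)
                list_remove(h); /* fully used: hide it from future searches */

        *offset = slot * CACHELINE_BYTES;
        return h;
}

static void hwsp_free(struct hwsp *h, unsigned int offset)
{
        if (!h->free) { /* regained a cacheline: republish the page */
                h->next = free_list;
                free_list = h;
        }

        h->free |= 1ull << (offset / CACHELINE_BYTES);
        if (h->free == ~0ull) { /* wholly unused: hand the page back */
                list_remove(h);
                free(h);
        }
}

int main(void)
{
        unsigned int a, b;
        struct hwsp *p0 = hwsp_alloc(&a);
        struct hwsp *p1 = hwsp_alloc(&b);

        assert(p0 && p0 == p1 && a != b); /* the second user shares the page */
        hwsp_free(p0, a);
        hwsp_free(p1, b);                 /* last user gone: page released */
        printf("shared one page, offsets %u and %u\n", a, b);
        return 0;
}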

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: John Harrison <John.C.Harrison@Intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  3 +-
 drivers/gpu/drm/i915/i915_timeline.c | 85 +++++++++++++++++-----------
 2 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index d59228dabb6e..cd2ea6e13fa6 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1981,8 +1981,7 @@ struct drm_i915_private {
 
 			/* Pack multiple timelines' seqnos into the same page */
 			spinlock_t hwsp_lock;
-			struct i915_vma *hwsp;
-			u64 bitmap;
+			struct list_head hwsp_free_list;
 		} timelines;
 
 		struct list_head active_rings;
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 1579ace82cfd..92532f713dd7 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -9,74 +9,98 @@
 #include "i915_timeline.h"
 #include "i915_syncmap.h"
 
+struct i915_timeline_hwsp {
+	struct list_head link;
+	struct i915_vma *vma;
+	u64 bitmap;
+};
+
 static int hwsp_alloc(struct i915_timeline *timeline)
 {
-#define NBITS BITS_PER_TYPE(typeof(gt->bitmap))
 	struct drm_i915_private *i915 = timeline->i915;
 	struct i915_gt_timelines *gt = &i915->gt.timelines;
-	struct i915_vma *vma;
+	struct i915_timeline_hwsp *hwsp;
 	int offset;
 
 	spin_lock(&gt->hwsp_lock);
 
-restart:
-	offset = find_first_bit((unsigned long *)&gt->bitmap, NBITS);
-	if (offset == NBITS && gt->hwsp) {
-		i915_vma_put(gt->hwsp);
-		gt->hwsp = NULL;
-	}
-
-	vma = gt->hwsp;
-	if (!vma) {
+	/* hwsp_free_list only contains HWSP that have available cachelines */
+	hwsp = list_first_entry_or_null(&gt->hwsp_free_list,
+					typeof(*hwsp), link);
+	if (!hwsp) {
 		struct drm_i915_gem_object *obj;
+		struct i915_vma *vma;
 
 		spin_unlock(&gt->hwsp_lock);
 
-		BUILD_BUG_ON(NBITS * CACHELINE_BYTES > PAGE_SIZE);
+		hwsp = kmalloc(sizeof(*hwsp), GFP_KERNEL);
+		if (!hwsp)
+			return -ENOMEM;
+
+		BUILD_BUG_ON(BITS_PER_TYPE(hwsp->bitmap) * CACHELINE_BYTES > PAGE_SIZE);
 		obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
-		if (IS_ERR(obj))
+		if (IS_ERR(obj)) {
+			kfree(hwsp);
 			return PTR_ERR(obj);
+		}
 
 		i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
 
 		vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
 		if (IS_ERR(vma)) {
 			i915_gem_object_put(obj);
+			kfree(hwsp);
 			return PTR_ERR(vma);
 		}
 
-		spin_lock(&gt->hwsp_lock);
-		if (gt->hwsp) {
-			i915_gem_object_put(obj);
-			goto restart;
-		}
+		vma->private = hwsp;
+		hwsp->vma = vma;
+		hwsp->bitmap = ~0ull;
 
-		gt->hwsp = vma;
-		gt->bitmap = ~0ull;
-		offset = 0;
+		spin_lock(&gt->hwsp_lock);
+		list_add(&hwsp->link, &gt->hwsp_free_list);
 	}
 
-	gt->bitmap &= ~BIT_ULL(offset);
+	GEM_BUG_ON(!hwsp->bitmap);
+	offset = __ffs64(hwsp->bitmap);
+	hwsp->bitmap &= ~BIT_ULL(offset);
+	if (!hwsp->bitmap)
+		list_del(&hwsp->link);
 
 	spin_unlock(&gt->hwsp_lock);
 
-	timeline->hwsp_ggtt = i915_vma_get(vma);
+	timeline->hwsp_ggtt = i915_vma_get(hwsp->vma);
 	timeline->hwsp_offset = offset * CACHELINE_BYTES;
 
+	GEM_BUG_ON(timeline->hwsp_ggtt->private != hwsp);
+
 	return 0;
-#undef NBITS
 }
 
 static void hwsp_free(struct i915_timeline *timeline)
 {
 	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
+	struct i915_timeline_hwsp *hwsp;
 
-	if (timeline->hwsp_ggtt != gt->hwsp)
+	hwsp = timeline->hwsp_ggtt->private;
+	if (!hwsp)
 		return;
 
 	spin_lock(&gt->hwsp_lock);
-	if (timeline->hwsp_ggtt == gt->hwsp)
-		gt->bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
+
+	/* As a cacheline becomes available, publish the HWSP on the freelist */
+	if (!hwsp->bitmap)
+		list_add_tail(&hwsp->link, &gt->hwsp_free_list);
+
+	hwsp->bitmap |= BIT_ULL(timeline->hwsp_offset / CACHELINE_BYTES);
+
+	/* And if no one is left using it, give the page back to the system */
+	if (hwsp->bitmap == ~0ull) {
+		i915_vma_put(hwsp->vma);
+		list_del(&hwsp->link);
+		kfree(hwsp);
+	}
+
 	spin_unlock(&gt->hwsp_lock);
 }
 
@@ -148,6 +172,7 @@ void i915_timelines_init(struct drm_i915_private *i915)
 	INIT_LIST_HEAD(&gt->list);
 
 	spin_lock_init(&gt->hwsp_lock);
+	INIT_LIST_HEAD(&gt->hwsp_free_list);
 
 	/* via i915_gem_wait_for_idle() */
 	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
@@ -264,13 +289,9 @@ void __i915_timeline_free(struct kref *kref)
 void i915_timelines_fini(struct drm_i915_private *i915)
 {
 	struct i915_gt_timelines *gt = &i915->gt.timelines;
-	struct i915_vma *vma;
 
 	GEM_BUG_ON(!list_empty(&gt->list));
-
-	vma = fetch_and_zero(&i915->gt.timelines.hwsp);
-	if (vma)
-		i915_vma_put(vma);
+	GEM_BUG_ON(!list_empty(&gt->hwsp_free_list));
 
 	mutex_destroy(&gt->mutex);
 }
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 18/38] drm/i915: Track the context's seqno in its own timeline HWSP
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (16 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 17/38] drm/i915: Keep all partially allocated HWSP on a freelist Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 19/38] drm/i915: Identify active requests Chris Wilson
                   ` (21 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Now that we have allocated ourselves a cacheline to store a breadcrumb,
we can emit a write from the GPU into the timeline's HWSP of the
per-context seqno as we complete each request. This drops the mirroring
of the per-engine HWSP and allows each context to operate independently.
We do not need to unwind the per-context timeline, and so requests are
always consistent with the timeline breadcrumb, greatly simplifying the
completion checks as we no longer need to be concerned about the
global_seqno changing mid-check.
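
With the breadcrumb stored in the timeline's own cacheline, completion
becomes a single read of that cacheline and one wrap-safe comparison
against the request's fence seqno. A standalone model of the check (not
the driver code):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wrap-safe "has a reached b?" comparison for 32-bit seqnos. */
static bool seqno_passed(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) >= 0;
}

struct request {
        uint32_t seqno;             /* this request's breadcrumb value */
        const uint32_t *hwsp_seqno; /* the timeline's breadcrumb dword */
};

static bool request_completed(const struct request *rq)
{
        /* One read, one comparison; no global seqno to cross-check. */
        return seqno_passed(*rq->hwsp_seqno, rq->seqno);
}

int main(void)
{
        uint32_t hwsp = 41; /* the last breadcrumb the GPU wrote */
        struct request rq = { .seqno = 42, .hwsp_seqno = &hwsp };

        assert(!request_completed(&rq));
        hwsp = 42;          /* the GPU emits this request's breadcrumb */
        assert(request_completed(&rq));
        printf("completed\n");
        return 0;
}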

At this point, we are emitting both per-context and global seqno and
still using the single per-engine execution timeline for resolving
interrupts.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c              |  2 +-
 drivers/gpu/drm/i915/i915_request.c          |  2 +-
 drivers/gpu/drm/i915/i915_request.h          | 27 ++----
 drivers/gpu/drm/i915/i915_reset.c            |  1 +
 drivers/gpu/drm/i915/i915_vma.h              |  7 ++
 drivers/gpu/drm/i915/intel_lrc.c             | 32 ++++---
 drivers/gpu/drm/i915/intel_ringbuffer.c      | 91 ++++++++++++++------
 drivers/gpu/drm/i915/selftests/mock_engine.c |  8 +-
 8 files changed, 109 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0b44059dfab2..9d6edb0c8a75 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2890,7 +2890,7 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	 */
 	spin_lock_irqsave(&engine->timeline.lock, flags);
 	list_for_each_entry(request, &engine->timeline.requests, link) {
-		if (__i915_request_completed(request, request->global_seqno))
+		if (i915_request_completed(request))
 			continue;
 
 		active = request;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 7d068c406a49..0d7b71aff28f 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -614,7 +614,7 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->ring = ce->ring;
 	rq->timeline = ce->ring->timeline;
 	GEM_BUG_ON(rq->timeline == &engine->timeline);
-	rq->hwsp_seqno = &engine->status_page.addr[I915_GEM_HWS_INDEX];
+	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
 
 	spin_lock_init(&rq->lock);
 	dma_fence_init(&rq->fence,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 4dd22dadf5ce..a16a3b7f7d92 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -324,32 +324,21 @@ static inline u32 hwsp_seqno(const struct i915_request *rq)
  */
 static inline bool i915_request_started(const struct i915_request *rq)
 {
-	u32 seqno;
-
-	seqno = i915_request_global_seqno(rq);
-	if (!seqno) /* not yet submitted to HW */
-		return false;
-
-	return i915_seqno_passed(hwsp_seqno(rq), seqno - 1);
+	return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
 }
 
-static inline bool
-__i915_request_completed(const struct i915_request *rq, u32 seqno)
+static inline bool i915_request_completed(const struct i915_request *rq)
 {
-	GEM_BUG_ON(!seqno);
-	return i915_seqno_passed(hwsp_seqno(rq), seqno) &&
-		seqno == i915_request_global_seqno(rq);
+	return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno);
 }
 
-static inline bool i915_request_completed(const struct i915_request *rq)
+static inline void i915_request_fake_complete(const struct i915_request *rq)
 {
-	u32 seqno;
-
-	seqno = i915_request_global_seqno(rq);
-	if (!seqno)
-		return false;
+	/* Don't allow ourselves to accidentally go backwards. */
+	if (i915_request_completed(rq))
+		return;
 
-	return __i915_request_completed(rq, seqno);
+	WRITE_ONCE(*(u32 *)rq->hwsp_seqno, rq->fence.seqno);
 }
 
 void i915_retire_requests(struct drm_i915_private *i915);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index 12e5a2bc825c..eff76558b958 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -756,6 +756,7 @@ static void nop_submit_request(struct i915_request *request)
 
 	spin_lock_irqsave(&request->engine->timeline.lock, flags);
 	__i915_request_submit(request);
+	i915_request_fake_complete(request);
 	intel_engine_write_global_seqno(request->engine, request->global_seqno);
 	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
 }
diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
index 5793abe509a2..18be786a970d 100644
--- a/drivers/gpu/drm/i915/i915_vma.h
+++ b/drivers/gpu/drm/i915/i915_vma.h
@@ -221,6 +221,13 @@ static inline u32 i915_ggtt_offset(const struct i915_vma *vma)
 	return lower_32_bits(vma->node.start);
 }
 
+/* XXX inline spaghetti */
+static inline u32 i915_timeline_seqno_address(const struct i915_timeline *tl)
+{
+	GEM_BUG_ON(!tl->pin_count);
+	return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset;
+}
+
 static inline u32 i915_ggtt_pin_bias(struct i915_vma *vma)
 {
 	return i915_vm_to_ggtt(vma->vm)->pin_bias;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index ba59241add47..041b64fb203c 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -837,10 +837,10 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	list_for_each_entry(rq, &engine->timeline.requests, link) {
 		GEM_BUG_ON(!rq->global_seqno);
 
-		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
-			continue;
+		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
+			dma_fence_set_error(&rq->fence, -EIO);
 
-		dma_fence_set_error(&rq->fence, -EIO);
+		i915_request_fake_complete(rq);
 	}
 
 	/* Flush the queued requests to the timeline list (for retiring). */
@@ -853,6 +853,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 
 			dma_fence_set_error(&rq->fence, -EIO);
 			__i915_request_submit(rq);
+			i915_request_fake_complete(rq);
 		}
 
 		rb_erase_cached(&p->node, &execlists->queue);
@@ -2034,31 +2035,40 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
 
-	cs = gen8_emit_ggtt_write(cs, request->global_seqno,
+	cs = gen8_emit_ggtt_write(cs,
+				  request->fence.seqno,
+				  i915_timeline_seqno_address(request->timeline));
+
+	cs = gen8_emit_ggtt_write(cs,
+				  request->global_seqno,
 				  intel_hws_seqno_address(request->engine));
+
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
+
 	request->tail = intel_ring_offset(request, cs);
 	assert_ring_tail_valid(request->ring, request->tail);
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
-	/* We're using qword write, seqno should be aligned to 8 bytes. */
-	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);
-
 	cs = gen8_emit_ggtt_write_rcs(cs,
-				      request->global_seqno,
-				      intel_hws_seqno_address(request->engine),
+				      request->fence.seqno,
+				      i915_timeline_seqno_address(request->timeline),
 				      PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
 				      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
 				      PIPE_CONTROL_DC_FLUSH_ENABLE |
 				      PIPE_CONTROL_FLUSH_ENABLE |
 				      PIPE_CONTROL_CS_STALL);
 
+	cs = gen8_emit_ggtt_write_rcs(cs,
+				      request->global_seqno,
+				      intel_hws_seqno_address(request->engine),
+				      PIPE_CONTROL_CS_STALL);
+
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 
@@ -2067,7 +2077,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_rcs_sz = 8 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct i915_request *rq)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 5887304bc3ae..bcc700e7037b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -325,6 +325,12 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 		 PIPE_CONTROL_DC_FLUSH_ENABLE |
 		 PIPE_CONTROL_QW_WRITE |
 		 PIPE_CONTROL_CS_STALL);
+	*cs++ = i915_timeline_seqno_address(rq->timeline) |
+		PIPE_CONTROL_GLOBAL_GTT;
+	*cs++ = rq->fence.seqno;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
 	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
 	*cs++ = rq->global_seqno;
 
@@ -334,7 +340,7 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen6_rcs_emit_breadcrumb_sz = 14;
+static const int gen6_rcs_emit_breadcrumb_sz = 18;
 
 static int
 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
@@ -425,6 +431,13 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 		 PIPE_CONTROL_QW_WRITE |
 		 PIPE_CONTROL_GLOBAL_GTT_IVB |
 		 PIPE_CONTROL_CS_STALL);
+	*cs++ = i915_timeline_seqno_address(rq->timeline);
+	*cs++ = rq->fence.seqno;
+
+	*cs++ = GFX_OP_PIPE_CONTROL(4);
+	*cs++ = (PIPE_CONTROL_QW_WRITE |
+		 PIPE_CONTROL_GLOBAL_GTT_IVB |
+		 PIPE_CONTROL_CS_STALL);
 	*cs++ = intel_hws_seqno_address(rq->engine);
 	*cs++ = rq->global_seqno;
 
@@ -434,27 +447,37 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen7_rcs_emit_breadcrumb_sz = 6;
+static const int gen7_rcs_emit_breadcrumb_sz = 10;
 
 static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
-	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
-	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = rq->fence.seqno;
+
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
+
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen6_xcs_emit_breadcrumb_sz = 4;
+static const int gen6_xcs_emit_breadcrumb_sz = 8;
 
 #define GEN7_XCS_WA 32
 static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	int i;
 
-	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW;
-	*cs++ = intel_hws_seqno_address(rq->engine) | MI_FLUSH_DW_USE_GTT;
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = rq->fence.seqno;
+
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
 
 	for (i = 0; i < GEN7_XCS_WA; i++) {
@@ -468,12 +491,11 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = 0;
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen7_xcs_emit_breadcrumb_sz = 8 + GEN7_XCS_WA * 3;
+static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3;
 #undef GEN7_XCS_WA
 
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
@@ -733,7 +755,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 	rq = NULL;
 	spin_lock_irqsave(&tl->lock, flags);
 	list_for_each_entry(pos, &tl->requests, link) {
-		if (!__i915_request_completed(pos, pos->global_seqno)) {
+		if (!i915_request_completed(pos)) {
 			rq = pos;
 			break;
 		}
@@ -875,11 +897,10 @@ static void cancel_requests(struct intel_engine_cs *engine)
 	list_for_each_entry(request, &engine->timeline.requests, link) {
 		GEM_BUG_ON(!request->global_seqno);
 
-		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-			     &request->fence.flags))
-			continue;
-
-		dma_fence_set_error(&request->fence, -EIO);
+		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+			      &request->fence.flags))
+			dma_fence_set_error(&request->fence, -EIO);
+		i915_request_fake_complete(request);
 	}
 
 	intel_write_status_page(engine,
@@ -903,27 +924,38 @@ static void i9xx_submit_request(struct i915_request *request)
 
 static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
+	GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
+
 	*cs++ = MI_FLUSH;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
+	*cs++ = rq->fence.seqno;
+
 	*cs++ = MI_STORE_DWORD_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR;
 	*cs++ = rq->global_seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int i9xx_emit_breadcrumb_sz = 6;
+static const int i9xx_emit_breadcrumb_sz = 8;
 
 #define GEN5_WA_STORES 8 /* must be at least 1! */
 static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	int i;
 
+	GEM_BUG_ON(rq->timeline->hwsp_ggtt != rq->engine->status_page.vma);
+
 	*cs++ = MI_FLUSH;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
+	*cs++ = rq->fence.seqno;
+
 	BUILD_BUG_ON(GEN5_WA_STORES < 1);
 	for (i = 0; i < GEN5_WA_STORES; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
@@ -932,11 +964,12 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	}
 
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 2;
+static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 6;
 #undef GEN5_WA_STORES
 
 static void
@@ -1163,6 +1196,10 @@ int intel_ring_pin(struct intel_ring *ring)
 
 	GEM_BUG_ON(ring->vaddr);
 
+	ret = i915_timeline_pin(ring->timeline);
+	if (ret)
+		return ret;
+
 	flags = PIN_GLOBAL;
 
 	/* Ring wraparound at offset 0 sometimes hangs. No idea why. */
@@ -1179,28 +1216,32 @@ int intel_ring_pin(struct intel_ring *ring)
 		else
 			ret = i915_gem_object_set_to_cpu_domain(vma->obj, true);
 		if (unlikely(ret))
-			return ret;
+			goto unpin_timeline;
 	}
 
 	ret = i915_vma_pin(vma, 0, 0, flags);
 	if (unlikely(ret))
-		return ret;
+		goto unpin_timeline;
 
 	if (i915_vma_is_map_and_fenceable(vma))
 		addr = (void __force *)i915_vma_pin_iomap(vma);
 	else
 		addr = i915_gem_object_pin_map(vma->obj, map);
-	if (IS_ERR(addr))
-		goto err;
+	if (IS_ERR(addr)) {
+		ret = PTR_ERR(addr);
+		goto unpin_ring;
+	}
 
 	vma->obj->pin_global++;
 
 	ring->vaddr = addr;
 	return 0;
 
-err:
+unpin_ring:
 	i915_vma_unpin(vma);
-	return PTR_ERR(addr);
+unpin_timeline:
+	i915_timeline_unpin(ring->timeline);
+	return ret;
 }
 
 void intel_ring_reset(struct intel_ring *ring, u32 tail)
@@ -1229,6 +1270,8 @@ void intel_ring_unpin(struct intel_ring *ring)
 
 	ring->vma->obj->pin_global--;
 	i915_vma_unpin(ring->vma);
+
+	i915_timeline_unpin(ring->timeline);
 }
 
 static struct i915_vma *
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index acd27c7e807b..b4b61056b227 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -79,6 +79,7 @@ static void advance(struct mock_engine *engine,
 		    struct mock_request *request)
 {
 	list_del_init(&request->link);
+	i915_request_fake_complete(&request->base);
 	mock_seqno_advance(&engine->base, request->base.global_seqno);
 }
 
@@ -253,16 +254,13 @@ void mock_engine_flush(struct intel_engine_cs *engine)
 	del_timer_sync(&mock->hw_delay);
 
 	spin_lock_irq(&mock->hw_lock);
-	list_for_each_entry_safe(request, rn, &mock->hw_queue, link) {
-		list_del_init(&request->link);
-		mock_seqno_advance(&mock->base, request->base.global_seqno);
-	}
+	list_for_each_entry_safe(request, rn, &mock->hw_queue, link)
+		advance(mock, request);
 	spin_unlock_irq(&mock->hw_lock);
 }
 
 void mock_engine_reset(struct intel_engine_cs *engine)
 {
-	intel_write_status_page(engine, I915_GEM_HWS_INDEX, 0);
 }
 
 void mock_engine_free(struct intel_engine_cs *engine)
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 19/38] drm/i915: Identify active requests
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (17 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 18/38] drm/i915: Track the context's seqno in its own timeline HWSP Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 20/38] drm/i915: Remove the intel_engine_notify tracepoint Chris Wilson
                   ` (20 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

To allow requests to forgo a common execution timeline, one question we
need to be able to answer is "is this request running?". To track
whether a request has started on HW, we can emit a breadcrumb at the
beginning of the request and check its timeline's HWSP to see if the
breadcrumb has advanced past the start of this request. (This is in
contrast to the global timeline where we need only ask if we are on the
global timeline and if the timeline has advanced past the end of the
previous request.)

There is still confusion from a preempted request, which has already
started but relinquished the HW to a high priority request. For the
common case, this discrepancy should be negligible. However, for
identification of hung requests, knowing which one was running at the
time of the hang will be much more important.
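
As a minimal sketch of the check (not the kernel code itself; u32/s32 are
the usual fixed-width types and "hwsp" stands for a plain read of the
timeline's HWSP slot):

	/*
	 * The initial breadcrumb at the head of each request stores
	 * (fence.seqno - 1) into the timeline's HWSP, so the request has
	 * started once the HWSP seqno has passed that value.
	 */
	static bool request_has_started(u32 hwsp, u32 fence_seqno)
	{
		/* wraparound-safe seqno comparison, as i915_seqno_passed() */
		return (s32)(hwsp - (fence_seqno - 1)) >= 0;
	}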

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c     |  1 +
 drivers/gpu/drm/i915/i915_request.h     |  1 +
 drivers/gpu/drm/i915/i915_timeline.c    |  1 +
 drivers/gpu/drm/i915/i915_timeline.h    |  2 ++
 drivers/gpu/drm/i915/intel_engine_cs.c  |  4 +++-
 drivers/gpu/drm/i915/intel_lrc.c        | 23 +++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_ringbuffer.c |  2 ++
 7 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 0d7b71aff28f..f61cc5c1bf08 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -332,6 +332,7 @@ void i915_request_retire_upto(struct i915_request *rq)
 
 static u32 timeline_get_seqno(struct i915_timeline *tl)
 {
+	tl->seqno += tl->has_initial_breadcrumb;
 	return ++tl->seqno;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index a16a3b7f7d92..83ce982dcbd9 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -324,6 +324,7 @@ static inline u32 hwsp_seqno(const struct i915_request *rq)
  */
 static inline bool i915_request_started(const struct i915_request *rq)
 {
+	/* Remember: started but may have since been preempted! */
 	return i915_seqno_passed(hwsp_seqno(rq), rq->fence.seqno - 1);
 }
 
diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
index 92532f713dd7..189717db11a9 100644
--- a/drivers/gpu/drm/i915/i915_timeline.c
+++ b/drivers/gpu/drm/i915/i915_timeline.c
@@ -124,6 +124,7 @@ int i915_timeline_init(struct drm_i915_private *i915,
 	timeline->i915 = i915;
 	timeline->name = name;
 	timeline->pin_count = 0;
+	timeline->has_initial_breadcrumb = !global_hwsp;
 
 	if (global_hwsp) {
 		timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 0c3739d53d79..421eb34568de 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -47,6 +47,8 @@ struct i915_timeline {
 	struct i915_vma *hwsp_ggtt;
 	u32 hwsp_offset;
 
+	bool has_initial_breadcrumb;
+
 	/**
 	 * List of breadcrumbs associated with GPU requests currently
 	 * outstanding.
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c850d131d8c3..ae455b874c9f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1239,7 +1239,9 @@ static void print_request(struct drm_printer *m,
 	drm_printf(m, "%s%x%s [%llx:%llx]%s @ %dms: %s\n",
 		   prefix,
 		   rq->global_seqno,
-		   i915_request_completed(rq) ? "!" : "",
+		   i915_request_completed(rq) ? "!" :
+		   i915_request_started(rq) ? "*" :
+		   "",
 		   rq->fence.context, rq->fence.seqno,
 		   buf,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 041b64fb203c..740e09d3ca26 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1275,6 +1275,25 @@ execlists_context_pin(struct intel_engine_cs *engine,
 	return __execlists_context_pin(engine, ctx, ce);
 }
 
+static int emit_initial_breadcrumb(struct i915_request *rq)
+{
+	u32 *cs;
+
+	GEM_BUG_ON(!rq->timeline->has_initial_breadcrumb);
+
+	cs = intel_ring_begin(rq, 4);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+	*cs++ = i915_timeline_seqno_address(rq->timeline);
+	*cs++ = 0;
+	*cs++ = rq->fence.seqno - 1;
+
+	intel_ring_advance(rq, cs);
+	return 0;
+}
+
 static int emit_pdps(struct i915_request *rq)
 {
 	const struct intel_engine_cs * const engine = rq->engine;
@@ -1349,6 +1368,10 @@ static int execlists_request_alloc(struct i915_request *request)
 	 * to cancel/unwind this request now.
 	 */
 
+	ret = emit_initial_breadcrumb(request);
+	if (ret)
+		return ret;
+
 	/* Unconditionally invalidate GPU caches and TLBs. */
 	if (i915_vm_is_48bit(&request->gem_context->ppgtt->vm))
 		ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bcc700e7037b..2b4beb15a271 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1593,6 +1593,7 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
 		err = PTR_ERR(timeline);
 		goto err;
 	}
+	GEM_BUG_ON(timeline->has_initial_breadcrumb);
 
 	ring = intel_engine_create_ring(engine, timeline, 32 * PAGE_SIZE);
 	i915_timeline_put(timeline);
@@ -1946,6 +1947,7 @@ static int ring_request_alloc(struct i915_request *request)
 	int ret;
 
 	GEM_BUG_ON(!request->hw_context->pin_count);
+	GEM_BUG_ON(request->timeline->has_initial_breadcrumb);
 
 	/*
 	 * Flush enough space to reduce the likelihood of waiting after
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 20/38] drm/i915: Remove the intel_engine_notify tracepoint
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (18 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 19/38] drm/i915: Identify active requests Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 21/38] drm/i915: Replace global breadcrumbs with per-context interrupt tracking Chris Wilson
                   ` (19 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

The global seqno is defunct and so we have no meaningful indicator of
forward progress for an engine. You need to listen to the request
signaling tracepoints instead.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_irq.c   |  2 --
 drivers/gpu/drm/i915/i915_trace.h | 25 -------------------------
 2 files changed, 27 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 1abfc3fa76ad..8da5816e2854 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1210,8 +1210,6 @@ static void notify_ring(struct intel_engine_cs *engine)
 		wake_up_process(tsk);
 
 	rcu_read_unlock();
-
-	trace_intel_engine_notify(engine, wait);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index 33d90eca9cdd..cb5bc65d575d 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -750,31 +750,6 @@ trace_i915_request_out(struct i915_request *rq)
 #endif
 #endif
 
-TRACE_EVENT(intel_engine_notify,
-	    TP_PROTO(struct intel_engine_cs *engine, bool waiters),
-	    TP_ARGS(engine, waiters),
-
-	    TP_STRUCT__entry(
-			     __field(u32, dev)
-			     __field(u16, class)
-			     __field(u16, instance)
-			     __field(u32, seqno)
-			     __field(bool, waiters)
-			     ),
-
-	    TP_fast_assign(
-			   __entry->dev = engine->i915->drm.primary->index;
-			   __entry->class = engine->uabi_class;
-			   __entry->instance = engine->instance;
-			   __entry->seqno = intel_engine_get_seqno(engine);
-			   __entry->waiters = waiters;
-			   ),
-
-	    TP_printk("dev=%u, engine=%u:%u, seqno=%u, waiters=%u",
-		      __entry->dev, __entry->class, __entry->instance,
-		      __entry->seqno, __entry->waiters)
-);
-
 DEFINE_EVENT(i915_request, i915_request_retire,
 	    TP_PROTO(struct i915_request *rq),
 	    TP_ARGS(rq)
-- 
2.20.1

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 21/38] drm/i915: Replace global breadcrumbs with per-context interrupt tracking
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (19 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 20/38] drm/i915: Remove the intel_engine_notify tracepoint Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 22/38] drm/i915: Drop fake breadcrumb irq Chris Wilson
                   ` (18 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

A few years ago, see commit 688e6c725816 ("drm/i915: Slaughter the
thundering i915_wait_request herd"), the issue of handling multiple
clients waiting in parallel was brought to our attention. The
requirement was that every client should be woken immediately upon its
request being signaled, without incurring any cpu overhead.

Handling certain fragility of our hw meant that we could not do a
simple check inside the irq handler (some generations required almost
unbounded delays before we could be sure of seqno coherency), and so
request completion checking required delegation.

Before commit 688e6c725816, the solution was simple. Every client waiting
on a request would be woken on every interrupt, and each would do a
heavyweight check to see if their request was complete. Commit
688e6c725816 introduced an rbtree so that only the earliest waiter on
the global timeline would be woken, which would then wake the next and so
on. (Along with various complications to handle requests being reordered
along the global timeline, and a requirement for a kthread to provide a
delegate for fence signaling that had no process context.)

The global rbtree depends on knowing the execution timeline (and global
seqno). Without knowing that order, we must instead check all contexts
queued to the HW to see which may have advanced. We trim that list by
only checking queued contexts that are being waited on, but still we
keep a list of all active contexts and their active signalers that we
inspect from inside the irq handler. By moving the waiters onto the fence
signal list, we can combine the client wakeup with the dma_fence
signaling (a dramatic reduction in complexity, but it does require the HW
to be coherent: the seqno must be visible from the CPU before the
interrupt is raised - we keep a timer backup just in case).
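
In outline, the irq-side walk added by this patch (see
intel_engine_breadcrumbs_irq() below) does something like the following
sketch; the struct and helper names here are hypothetical, and locking,
refcounting and the irq_work are elided (list helpers from <linux/list.h>):

	struct signaling_ctx {
		struct list_head signal_link;	/* on the engine's signalers */
		struct list_head signals;	/* requests, in seqno order */
	};

	struct signaling_rq {
		struct list_head signal_link;
		const u32 *hwsp;	/* this request's timeline HWSP slot */
		u32 seqno;
	};

	static bool rq_completed(const struct signaling_rq *rq)
	{
		return (s32)(READ_ONCE(*rq->hwsp) - rq->seqno) >= 0;
	}

	static void collect_completed(struct list_head *signalers,
				      struct list_head *out)
	{
		struct signaling_ctx *ce, *cn;

		list_for_each_entry_safe(ce, cn, signalers, signal_link) {
			struct signaling_rq *rq, *rn;

			list_for_each_entry_safe(rq, rn, &ce->signals,
						 signal_link) {
				if (!rq_completed(rq))
					break;	/* list is in seqno order */

				list_move_tail(&rq->signal_link, out);
			}

			if (list_empty(&ce->signals))
				list_del_init(&ce->signal_link);
		}
	}

dma_fence_signal() is then called on each collected request after dropping
the signaling spinlock, so that fence callbacks can add new signalers.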

Having previously fixed all the issues with irq-seqno serialisation (by
inserting delays onto the GPU after each request instead of random delays
on the CPU after each interrupt), we can rely on the seqno state to
perform direct wakeups from the interrupt handler. This allows us to
preserve our single context switch behaviour of the current routine,
with the only downside that we lose the RT priority sorting of wakeups.
In general, direct wakeup latency of multiple clients is about the same
(about 10% better in most cases) with a reduction in total CPU time spent
in the waiter (about 20-50% depending on gen). Average herd behaviour is
improved, but at the cost of not delegating wakeups on task_prio.
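
The waiter side (the i915_request_wait() rework in the hunk below) then
reduces, roughly and with the timeout/error bookkeeping elided, to an
optimistic spin followed by a dma-fence callback that wakes the sleeping
task:

	struct request_wait wait = { .tsk = current };

	if (__i915_spin_request(rq, state, 5))
		goto out;

	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
		goto out;	/* already signaled */

	for (;;) {
		set_current_state(state);
		if (i915_request_completed(rq))
			break;
		timeout = io_schedule_timeout(timeout);
	}
	__set_current_state(TASK_RUNNING);
	dma_fence_remove_callback(&rq->fence, &wait.cb);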

References: 688e6c725816 ("drm/i915: Slaughter the thundering i915_wait_request herd")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  28 +-
 drivers/gpu/drm/i915/i915_gem_context.c       |   2 +
 drivers/gpu/drm/i915/i915_gem_context.h       |   2 +
 drivers/gpu/drm/i915/i915_gpu_error.c         |  73 --
 drivers/gpu/drm/i915/i915_gpu_error.h         |   8 -
 drivers/gpu/drm/i915/i915_irq.c               |  88 +-
 drivers/gpu/drm/i915/i915_request.c           | 128 +--
 drivers/gpu/drm/i915/i915_request.h           |  22 +-
 drivers/gpu/drm/i915/i915_reset.c             |  13 +-
 drivers/gpu/drm/i915/intel_breadcrumbs.c      | 793 +++++-------------
 drivers/gpu/drm/i915/intel_engine_cs.c        |  34 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c       |   6 +-
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  95 +--
 .../drm/i915/selftests/i915_mock_selftests.h  |   1 -
 drivers/gpu/drm/i915/selftests/i915_request.c | 398 +++++++++
 drivers/gpu/drm/i915/selftests/igt_spinner.c  |   5 -
 .../drm/i915/selftests/intel_breadcrumbs.c    | 470 -----------
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/selftests/lib_sw_fence.c |  54 ++
 drivers/gpu/drm/i915/selftests/lib_sw_fence.h |   3 +
 drivers/gpu/drm/i915/selftests/mock_context.c |   2 +
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  26 +-
 drivers/gpu/drm/i915/selftests/mock_engine.h  |   6 -
 23 files changed, 774 insertions(+), 1485 deletions(-)
 delete mode 100644 drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 2a6e4044f25b..d7764e62e9b4 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1315,29 +1315,16 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	seq_printf(m, "GT active? %s\n", yesno(dev_priv->gt.awake));
 
 	for_each_engine(engine, dev_priv, id) {
-		struct intel_breadcrumbs *b = &engine->breadcrumbs;
-		struct rb_node *rb;
-
 		seq_printf(m, "%s:\n", engine->name);
 		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
 			   engine->hangcheck.seqno, seqno[id],
 			   intel_engine_last_submit(engine),
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
-			   yesno(intel_engine_has_waiter(engine)),
+		seq_printf(m, "\tfake irq active? %s\n",
 			   yesno(test_bit(engine->id,
 					  &dev_priv->gpu_error.missed_irq_rings)));
 
-		spin_lock_irq(&b->rb_lock);
-		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
-			struct intel_wait *w = rb_entry(rb, typeof(*w), node);
-
-			seq_printf(m, "\t%s [%d] waiting for %x\n",
-				   w->tsk->comm, w->tsk->pid, w->seqno);
-		}
-		spin_unlock_irq(&b->rb_lock);
-
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
@@ -2021,18 +2008,6 @@ static int i915_swizzle_info(struct seq_file *m, void *data)
 	return 0;
 }
 
-static int count_irq_waiters(struct drm_i915_private *i915)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-	int count = 0;
-
-	for_each_engine(engine, i915, id)
-		count += intel_engine_has_waiter(engine);
-
-	return count;
-}
-
 static const char *rps_power_to_str(unsigned int power)
 {
 	static const char * const strings[] = {
@@ -2072,7 +2047,6 @@ static int i915_rps_boost_info(struct seq_file *m, void *data)
 	seq_printf(m, "RPS enabled? %d\n", rps->enabled);
 	seq_printf(m, "GPU busy? %s [%d requests]\n",
 		   yesno(dev_priv->gt.awake), dev_priv->gt.active_requests);
-	seq_printf(m, "CPU waiting? %d\n", count_irq_waiters(dev_priv));
 	seq_printf(m, "Boosts outstanding? %d\n",
 		   atomic_read(&rps->num_waiters));
 	seq_printf(m, "Interactive? %d\n", READ_ONCE(rps->power.interactive));
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 5933adbe3d99..054d3e1bfe00 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -342,6 +342,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 		struct intel_context *ce = &ctx->__engine[n];
 
 		ce->gem_context = ctx;
+		INIT_LIST_HEAD(&ce->signal_link);
+		INIT_LIST_HEAD(&ce->signals);
 	}
 
 	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index f6d870b1f73e..e5eca29cd373 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -164,6 +164,8 @@ struct i915_gem_context {
 	struct intel_context {
 		struct i915_gem_context *gem_context;
 		struct intel_engine_cs *active;
+		struct list_head signal_link;
+		struct list_head signals;
 		struct i915_vma *state;
 		struct intel_ring *ring;
 		u32 *lrc_reg_state;
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 6f2fcad0a6ee..7c2510ce81d2 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -530,7 +530,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	}
 	err_printf(m, "  seqno: 0x%08x\n", ee->seqno);
 	err_printf(m, "  last_seqno: 0x%08x\n", ee->last_seqno);
-	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
 	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
@@ -804,21 +803,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 						    error->epoch);
 		}
 
-		if (IS_ERR(ee->waiters)) {
-			err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
-				   m->i915->engine[i]->name);
-		} else if (ee->num_waiters) {
-			err_printf(m, "%s --- %d waiters\n",
-				   m->i915->engine[i]->name,
-				   ee->num_waiters);
-			for (j = 0; j < ee->num_waiters; j++) {
-				err_printf(m, " seqno 0x%08x for %s [%d]\n",
-					   ee->waiters[j].seqno,
-					   ee->waiters[j].comm,
-					   ee->waiters[j].pid);
-			}
-		}
-
 		print_error_obj(m, m->i915->engine[i],
 				"ringbuffer", ee->ringbuffer);
 
@@ -1000,8 +984,6 @@ void __i915_gpu_state_free(struct kref *error_ref)
 		i915_error_object_free(ee->wa_ctx);
 
 		kfree(ee->requests);
-		if (!IS_ERR_OR_NULL(ee->waiters))
-			kfree(ee->waiters);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
@@ -1201,59 +1183,6 @@ static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
 			I915_READ(RING_SYNC_2(engine->mmio_base));
 }
 
-static void error_record_engine_waiters(struct intel_engine_cs *engine,
-					struct drm_i915_error_engine *ee)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct drm_i915_error_waiter *waiter;
-	struct rb_node *rb;
-	int count;
-
-	ee->num_waiters = 0;
-	ee->waiters = NULL;
-
-	if (RB_EMPTY_ROOT(&b->waiters))
-		return;
-
-	if (!spin_trylock_irq(&b->rb_lock)) {
-		ee->waiters = ERR_PTR(-EDEADLK);
-		return;
-	}
-
-	count = 0;
-	for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
-		count++;
-	spin_unlock_irq(&b->rb_lock);
-
-	waiter = NULL;
-	if (count)
-		waiter = kmalloc_array(count,
-				       sizeof(struct drm_i915_error_waiter),
-				       GFP_ATOMIC);
-	if (!waiter)
-		return;
-
-	if (!spin_trylock_irq(&b->rb_lock)) {
-		kfree(waiter);
-		ee->waiters = ERR_PTR(-EDEADLK);
-		return;
-	}
-
-	ee->waiters = waiter;
-	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
-		struct intel_wait *w = rb_entry(rb, typeof(*w), node);
-
-		strcpy(waiter->comm, w->tsk->comm);
-		waiter->pid = w->tsk->pid;
-		waiter->seqno = w->seqno;
-		waiter++;
-
-		if (++ee->num_waiters == count)
-			break;
-	}
-	spin_unlock_irq(&b->rb_lock);
-}
-
 static void error_record_engine_registers(struct i915_gpu_state *error,
 					  struct intel_engine_cs *engine,
 					  struct drm_i915_error_engine *ee)
@@ -1289,7 +1218,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 
 	intel_engine_get_instdone(engine, &ee->instdone);
 
-	ee->waiting = intel_engine_has_waiter(engine);
 	ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
 	ee->acthd = intel_engine_get_active_head(engine);
 	ee->seqno = intel_engine_get_seqno(engine);
@@ -1538,7 +1466,6 @@ static void gem_record_rings(struct i915_gpu_state *error)
 		ee->engine_id = i;
 
 		error_record_engine_registers(error, engine, ee);
-		error_record_engine_waiters(engine, ee);
 		error_record_engine_execlists(engine, ee);
 
 		request = i915_gem_find_active_request(engine);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 231173786eae..0e184712cbcc 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -82,8 +82,6 @@ struct i915_gpu_state {
 		int engine_id;
 		/* Software tracked state */
 		bool idle;
-		bool waiting;
-		int num_waiters;
 		unsigned long hangcheck_timestamp;
 		struct i915_address_space *vm;
 		int num_requests;
@@ -159,12 +157,6 @@ struct i915_gpu_state {
 		} *requests, execlist[EXECLIST_MAX_PORTS];
 		unsigned int num_ports;
 
-		struct drm_i915_error_waiter {
-			char comm[TASK_COMM_LEN];
-			pid_t pid;
-			u32 seqno;
-		} *waiters;
-
 		struct {
 			u32 gfx_mode;
 			union {
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 8da5816e2854..7669b1caeef0 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -28,9 +28,10 @@
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/sysrq.h>
-#include <linux/slab.h>
 #include <linux/circ_buf.h>
+#include <linux/slab.h>
+#include <linux/sysrq.h>
+
 #include <drm/i915_drm.h>
 #include "i915_drv.h"
 #include "i915_trace.h"
@@ -1151,67 +1152,6 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 	return;
 }
 
-static void notify_ring(struct intel_engine_cs *engine)
-{
-	const u32 seqno = intel_engine_get_seqno(engine);
-	struct i915_request *rq = NULL;
-	struct task_struct *tsk = NULL;
-	struct intel_wait *wait;
-
-	if (unlikely(!engine->breadcrumbs.irq_armed))
-		return;
-
-	rcu_read_lock();
-
-	spin_lock(&engine->breadcrumbs.irq_lock);
-	wait = engine->breadcrumbs.irq_wait;
-	if (wait) {
-		/*
-		 * We use a callback from the dma-fence to submit
-		 * requests after waiting on our own requests. To
-		 * ensure minimum delay in queuing the next request to
-		 * hardware, signal the fence now rather than wait for
-		 * the signaler to be woken up. We still wake up the
-		 * waiter in order to handle the irq-seqno coherency
-		 * issues (we may receive the interrupt before the
-		 * seqno is written, see __i915_request_irq_complete())
-		 * and to handle coalescing of multiple seqno updates
-		 * and many waiters.
-		 */
-		if (i915_seqno_passed(seqno, wait->seqno)) {
-			struct i915_request *waiter = wait->request;
-
-			if (waiter &&
-			    !test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-				      &waiter->fence.flags) &&
-			    intel_wait_check_request(wait, waiter))
-				rq = i915_request_get(waiter);
-
-			tsk = wait->tsk;
-		}
-
-		engine->breadcrumbs.irq_count++;
-	} else {
-		if (engine->breadcrumbs.irq_armed)
-			__intel_engine_disarm_breadcrumbs(engine);
-	}
-	spin_unlock(&engine->breadcrumbs.irq_lock);
-
-	if (rq) {
-		spin_lock(&rq->lock);
-		dma_fence_signal_locked(&rq->fence);
-		GEM_BUG_ON(!i915_request_completed(rq));
-		spin_unlock(&rq->lock);
-
-		i915_request_put(rq);
-	}
-
-	if (tsk && tsk->state & TASK_NORMAL)
-		wake_up_process(tsk);
-
-	rcu_read_unlock();
-}
-
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
 			struct intel_rps_ei *ei)
 {
@@ -1456,20 +1396,20 @@ static void ilk_gt_irq_handler(struct drm_i915_private *dev_priv,
 			       u32 gt_iir)
 {
 	if (gt_iir & GT_RENDER_USER_INTERRUPT)
-		notify_ring(dev_priv->engine[RCS]);
+		intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
 	if (gt_iir & ILK_BSD_USER_INTERRUPT)
-		notify_ring(dev_priv->engine[VCS]);
+		intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
 }
 
 static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 			       u32 gt_iir)
 {
 	if (gt_iir & GT_RENDER_USER_INTERRUPT)
-		notify_ring(dev_priv->engine[RCS]);
+		intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
 	if (gt_iir & GT_BSD_USER_INTERRUPT)
-		notify_ring(dev_priv->engine[VCS]);
+		intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
 	if (gt_iir & GT_BLT_USER_INTERRUPT)
-		notify_ring(dev_priv->engine[BCS]);
+		intel_engine_breadcrumbs_irq(dev_priv->engine[BCS]);
 
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
@@ -1489,7 +1429,7 @@ gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
 		tasklet = true;
 
 	if (iir & GT_RENDER_USER_INTERRUPT) {
-		notify_ring(engine);
+		intel_engine_breadcrumbs_irq(engine);
 		tasklet |= USES_GUC_SUBMISSION(engine->i915);
 	}
 
@@ -1835,7 +1775,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 
 	if (HAS_VEBOX(dev_priv)) {
 		if (pm_iir & PM_VEBOX_USER_INTERRUPT)
-			notify_ring(dev_priv->engine[VECS]);
+			intel_engine_breadcrumbs_irq(dev_priv->engine[VECS]);
 
 		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT)
 			DRM_DEBUG("Command parser error, pm_iir 0x%08x\n", pm_iir);
@@ -4258,7 +4198,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		I915_WRITE16(IIR, iir);
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(dev_priv->engine[RCS]);
+			intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
 
 		if (iir & I915_MASTER_ERROR_INTERRUPT)
 			i8xx_error_irq_handler(dev_priv, eir, eir_stuck);
@@ -4366,7 +4306,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
 		I915_WRITE(IIR, iir);
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(dev_priv->engine[RCS]);
+			intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
 
 		if (iir & I915_MASTER_ERROR_INTERRUPT)
 			i9xx_error_irq_handler(dev_priv, eir, eir_stuck);
@@ -4511,10 +4451,10 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
 		I915_WRITE(IIR, iir);
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(dev_priv->engine[RCS]);
+			intel_engine_breadcrumbs_irq(dev_priv->engine[RCS]);
 
 		if (iir & I915_BSD_USER_INTERRUPT)
-			notify_ring(dev_priv->engine[VCS]);
+			intel_engine_breadcrumbs_irq(dev_priv->engine[VCS]);
 
 		if (iir & I915_MASTER_ERROR_INTERRUPT)
 			i9xx_error_irq_handler(dev_priv, eir, eir_stuck);
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index f61cc5c1bf08..5ad14409b52d 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -60,7 +60,7 @@ static bool i915_fence_signaled(struct dma_fence *fence)
 
 static bool i915_fence_enable_signaling(struct dma_fence *fence)
 {
-	return intel_engine_enable_signaling(to_request(fence), true);
+	return intel_engine_enable_signaling(to_request(fence));
 }
 
 static signed long i915_fence_wait(struct dma_fence *fence,
@@ -370,9 +370,11 @@ void __i915_request_submit(struct i915_request *request)
 
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
 	request->global_seqno = seqno;
-	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
-		intel_engine_enable_signaling(request, false);
+	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
+	    !intel_engine_enable_signaling(request))
+		intel_engine_queue_breadcrumbs(engine);
 	spin_unlock(&request->lock);
 
 	engine->emit_breadcrumb(request,
@@ -382,8 +384,6 @@ void __i915_request_submit(struct i915_request *request)
 	move_to_timeline(request, &engine->timeline);
 
 	trace_i915_request_execute(request);
-
-	wake_up_all(&request->execute);
 }
 
 void i915_request_submit(struct i915_request *request)
@@ -427,6 +427,7 @@ void __i915_request_unsubmit(struct i915_request *request)
 	request->global_seqno = 0;
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
 		intel_engine_cancel_signaling(request);
+	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
 	spin_unlock(&request->lock);
 
 	/* Transfer back from the global per-engine timeline to per-context */
@@ -626,13 +627,11 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 
 	/* We bump the ref for the fence chain */
 	i915_sw_fence_init(&i915_request_get(rq)->submit, submit_notify);
-	init_waitqueue_head(&rq->execute);
 
 	i915_sched_node_init(&rq->sched);
 
 	/* No zalloc, must clear what we need by hand */
 	rq->global_seqno = 0;
-	rq->signaling.wait.seqno = 0;
 	rq->file_priv = NULL;
 	rq->batch = NULL;
 	rq->capture_list = NULL;
@@ -1023,13 +1022,10 @@ static bool busywait_stop(unsigned long timeout, unsigned int cpu)
 	return this_cpu != cpu;
 }
 
-static bool __i915_spin_request(const struct i915_request *rq,
-				u32 seqno, int state, unsigned long timeout_us)
+static bool __i915_spin_request(const struct i915_request * const rq,
+				int state, unsigned long timeout_us)
 {
-	struct intel_engine_cs *engine = rq->engine;
-	unsigned int irq, cpu;
-
-	GEM_BUG_ON(!seqno);
+	unsigned int cpu;
 
 	/*
 	 * Only wait for the request if we know it is likely to complete.
@@ -1042,7 +1038,7 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	 * it is a fair assumption that it will not complete within our
 	 * relatively short timeout.
 	 */
-	if (!intel_engine_has_started(engine, seqno))
+	if (!i915_request_started(rq))
 		return false;
 
 	/*
@@ -1056,20 +1052,10 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	 * takes to sleep on a request, on the order of a microsecond.
 	 */
 
-	irq = READ_ONCE(engine->breadcrumbs.irq_count);
 	timeout_us += local_clock_us(&cpu);
 	do {
-		if (intel_engine_has_completed(engine, seqno))
-			return seqno == i915_request_global_seqno(rq);
-
-		/*
-		 * Seqno are meant to be ordered *before* the interrupt. If
-		 * we see an interrupt without a corresponding seqno advance,
-		 * assume we won't see one in the near future but require
-		 * the engine->seqno_barrier() to fixup coherency.
-		 */
-		if (READ_ONCE(engine->breadcrumbs.irq_count) != irq)
-			break;
+		if (i915_request_completed(rq))
+			return true;
 
 		if (signal_pending_state(state, current))
 			break;
@@ -1083,6 +1069,18 @@ static bool __i915_spin_request(const struct i915_request *rq,
 	return false;
 }
 
+struct request_wait {
+	struct dma_fence_cb cb;
+	struct task_struct *tsk;
+};
+
+static void request_wait_wake(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct request_wait *wait = container_of(cb, typeof(*wait), cb);
+
+	wake_up_process(wait->tsk);
+}
+
 /**
  * i915_request_wait - wait until execution of request has finished
  * @rq: the request to wait upon
@@ -1108,8 +1106,7 @@ long i915_request_wait(struct i915_request *rq,
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
-	DEFINE_WAIT_FUNC(exec, default_wake_function);
-	struct intel_wait wait;
+	struct request_wait wait;
 
 	might_sleep();
 	GEM_BUG_ON(timeout < 0);
@@ -1121,47 +1118,24 @@ long i915_request_wait(struct i915_request *rq,
 		return -ETIME;
 
 	trace_i915_request_wait_begin(rq, flags);
-	add_wait_queue(&rq->execute, &exec);
-	intel_wait_init(&wait);
-	if (flags & I915_WAIT_PRIORITY)
-		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
-
-restart:
-	do {
-		set_current_state(state);
-		if (intel_wait_update_request(&wait, rq))
-			break;
-
-		if (signal_pending_state(state, current)) {
-			timeout = -ERESTARTSYS;
-			goto complete;
-		}
 
-		if (!timeout) {
-			timeout = -ETIME;
-			goto complete;
-		}
+	/* Optimistic short spin before touching IRQs */
+	if (__i915_spin_request(rq, state, 5))
+		goto out;
 
-		timeout = io_schedule_timeout(timeout);
-	} while (1);
+	if (flags & I915_WAIT_PRIORITY)
+		i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
 
-	GEM_BUG_ON(!intel_wait_has_seqno(&wait));
-	GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
+	wait.tsk = current;
+	if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))
+		goto out;
 
-	/* Optimistic short spin before touching IRQs */
-	if (__i915_spin_request(rq, wait.seqno, state, 5))
-		goto complete;
+	for (;;) {
+		set_current_state(state);
 
-	set_current_state(state);
-	if (intel_engine_add_wait(rq->engine, &wait))
-		/*
-		 * In order to check that we haven't missed the interrupt
-		 * as we enabled it, we need to kick ourselves to do a
-		 * coherent check on the seqno before we sleep.
-		 */
-		goto wakeup;
+		if (i915_request_completed(rq))
+			break;
 
-	for (;;) {
 		if (signal_pending_state(state, current)) {
 			timeout = -ERESTARTSYS;
 			break;
@@ -1173,33 +1147,13 @@ long i915_request_wait(struct i915_request *rq,
 		}
 
 		timeout = io_schedule_timeout(timeout);
-
-		if (intel_wait_complete(&wait) &&
-		    intel_wait_check_request(&wait, rq))
-			break;
-
-		set_current_state(state);
-
-wakeup:
-		if (i915_request_completed(rq))
-			break;
-
-		/* Only spin if we know the GPU is processing this request */
-		if (__i915_spin_request(rq, wait.seqno, state, 2))
-			break;
-
-		if (!intel_wait_check_request(&wait, rq)) {
-			intel_engine_remove_wait(rq->engine, &wait);
-			goto restart;
-		}
 	}
-
-	intel_engine_remove_wait(rq->engine, &wait);
-complete:
 	__set_current_state(TASK_RUNNING);
-	remove_wait_queue(&rq->execute, &exec);
-	trace_i915_request_wait_end(rq);
 
+	dma_fence_remove_callback(&rq->fence, &wait.cb);
+
+out:
+	trace_i915_request_wait_end(rq);
 	return timeout;
 }
 
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 83ce982dcbd9..b6d473923506 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -38,23 +38,16 @@ struct drm_i915_gem_object;
 struct i915_request;
 struct i915_timeline;
 
-struct intel_wait {
-	struct rb_node node;
-	struct task_struct *tsk;
-	struct i915_request *request;
-	u32 seqno;
-};
-
-struct intel_signal_node {
-	struct intel_wait wait;
-	struct list_head link;
-};
-
 struct i915_capture_list {
 	struct i915_capture_list *next;
 	struct i915_vma *vma;
 };
 
+enum {
+	I915_FENCE_FLAG_ACTIVE = DMA_FENCE_FLAG_USER_BITS,
+	I915_FENCE_FLAG_SIGNAL,
+};
+
 /**
  * Request queue structure.
  *
@@ -97,7 +90,7 @@ struct i915_request {
 	struct intel_context *hw_context;
 	struct intel_ring *ring;
 	struct i915_timeline *timeline;
-	struct intel_signal_node signaling;
+	struct list_head signal_link;
 
 	/*
 	 * The rcu epoch of when this request was allocated. Used to judiciously
@@ -116,7 +109,6 @@ struct i915_request {
 	 */
 	struct i915_sw_fence submit;
 	wait_queue_entry_t submitq;
-	wait_queue_head_t execute;
 
 	/*
 	 * A list of everyone we wait upon, and everyone who waits upon us.
@@ -255,7 +247,7 @@ i915_request_put(struct i915_request *rq)
  * that it has passed the global seqno and the global seqno is unchanged
  * after the read, it is indeed complete).
  */
-static u32
+static inline u32
 i915_request_global_seqno(const struct i915_request *request)
 {
 	return READ_ONCE(request->global_seqno);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index eff76558b958..d846f49401a6 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -747,18 +747,19 @@ static void reset_restart(struct drm_i915_private *i915)
 
 static void nop_submit_request(struct i915_request *request)
 {
+	struct intel_engine_cs *engine = request->engine;
 	unsigned long flags;
 
 	GEM_TRACE("%s fence %llx:%lld -> -EIO\n",
-		  request->engine->name,
-		  request->fence.context, request->fence.seqno);
+		  engine->name, request->fence.context, request->fence.seqno);
 	dma_fence_set_error(&request->fence, -EIO);
 
-	spin_lock_irqsave(&request->engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->timeline.lock, flags);
 	__i915_request_submit(request);
 	i915_request_fake_complete(request);
-	intel_engine_write_global_seqno(request->engine, request->global_seqno);
-	spin_unlock_irqrestore(&request->engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+
+	intel_engine_queue_breadcrumbs(engine);
 }
 
 void i915_gem_set_wedged(struct drm_i915_private *i915)
@@ -813,7 +814,7 @@ void i915_gem_set_wedged(struct drm_i915_private *i915)
 
 	for_each_engine(engine, i915, id) {
 		reset_finish_engine(engine);
-		intel_engine_wakeup(engine);
+		intel_engine_signal_breadcrumbs(engine);
 	}
 
 	smp_mb__before_atomic();
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 4fad93fe3678..1187d2edd330 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -29,48 +29,127 @@
 
 #define task_asleep(tsk) ((tsk)->state & TASK_NORMAL && !(tsk)->on_rq)
 
-static unsigned int __intel_breadcrumbs_wakeup(struct intel_breadcrumbs *b)
+static void irq_enable(struct intel_engine_cs *engine)
 {
-	struct intel_wait *wait;
-	unsigned int result = 0;
+	if (!engine->irq_enable)
+		return;
+
+	/* Caller disables interrupts */
+	spin_lock(&engine->i915->irq_lock);
+	engine->irq_enable(engine);
+	spin_unlock(&engine->i915->irq_lock);
+}
+
+static void irq_disable(struct intel_engine_cs *engine)
+{
+	if (!engine->irq_disable)
+		return;
+
+	/* Caller disables interrupts */
+	spin_lock(&engine->i915->irq_lock);
+	engine->irq_disable(engine);
+	spin_unlock(&engine->i915->irq_lock);
+}
 
+static void __intel_breadcrumbs_disarm_irq(struct intel_breadcrumbs *b)
+{
 	lockdep_assert_held(&b->irq_lock);
 
-	wait = b->irq_wait;
-	if (wait) {
-		/*
-		 * N.B. Since task_asleep() and ttwu are not atomic, the
-		 * waiter may actually go to sleep after the check, causing
-		 * us to suppress a valid wakeup. We prefer to reduce the
-		 * number of false positive missed_breadcrumb() warnings
-		 * at the expense of a few false negatives, as it it easy
-		 * to trigger a false positive under heavy load. Enough
-		 * signal should remain from genuine missed_breadcrumb()
-		 * for us to detect in CI.
-		 */
-		bool was_asleep = task_asleep(wait->tsk);
-
-		result = ENGINE_WAKEUP_WAITER;
-		if (wake_up_process(wait->tsk) && was_asleep)
-			result |= ENGINE_WAKEUP_ASLEEP;
-	}
+	GEM_BUG_ON(!b->irq_enabled);
+	if (!--b->irq_enabled)
+		irq_disable(container_of(b,
+					 struct intel_engine_cs,
+					 breadcrumbs));
 
-	return result;
+	b->irq_armed = false;
 }
 
-unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
+void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	unsigned long flags;
-	unsigned int result;
 
-	spin_lock_irqsave(&b->irq_lock, flags);
-	result = __intel_breadcrumbs_wakeup(b);
-	spin_unlock_irqrestore(&b->irq_lock, flags);
+	if (!b->irq_armed)
+		return;
+
+	spin_lock_irq(&b->irq_lock);
+	if (b->irq_armed)
+		__intel_breadcrumbs_disarm_irq(b);
+	spin_unlock_irq(&b->irq_lock);
+}
+
+bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
+{
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	struct intel_context *ce, *cn;
+	struct i915_request *rq, *rn;
+	LIST_HEAD(signal);
+
+	spin_lock(&b->irq_lock);
+
+	b->irq_fired = true;
+	if (b->irq_armed && list_empty(&b->signalers))
+		__intel_breadcrumbs_disarm_irq(b);
+
+	list_for_each_entry_safe(ce, cn, &b->signalers, signal_link) {
+		GEM_BUG_ON(list_empty(&ce->signals));
+
+		list_for_each_entry_safe(rq, rn, &ce->signals, signal_link) {
+			if (!i915_request_completed(rq))
+				break;
+
+			GEM_BUG_ON(!test_bit(I915_FENCE_FLAG_SIGNAL,
+					     &rq->fence.flags));
+			clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
+
+			if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+				     &rq->fence.flags))
+				continue;
+
+			/*
+			 * Queue for execution after dropping the signaling
+			 * spinlock as the callback chain may end adding
+			 * more signalers to the same context or engine.
+			 */
+			i915_request_get(rq);
+			list_add_tail(&rq->signal_link, &signal);
+		}
+
+		if (!list_is_first(&rq->signal_link, &ce->signals)) {
+			__list_del_many(&ce->signals, &rq->signal_link);
+			if (&ce->signals == &rq->signal_link)
+				list_del_init(&ce->signal_link);
+		}
+	}
+
+	spin_unlock(&b->irq_lock);
+
+	list_for_each_entry_safe(rq, rn, &signal, signal_link) {
+		dma_fence_signal(&rq->fence);
+		i915_request_put(rq);
+	}
+
+	return !list_empty(&signal);
+}
+
+bool intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine)
+{
+	bool result;
+
+	local_irq_disable();
+	result = intel_engine_breadcrumbs_irq(engine);
+	local_irq_enable();
 
 	return result;
 }
 
+static void signal_irq_work(struct irq_work *work)
+{
+	struct intel_engine_cs *engine =
+		container_of(work, typeof(*engine), breadcrumbs.irq_work);
+
+	intel_engine_breadcrumbs_irq(engine);
+}
+
 static unsigned long wait_timeout(void)
 {
 	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
@@ -94,19 +173,15 @@ static void intel_breadcrumbs_hangcheck(struct timer_list *t)
 	struct intel_engine_cs *engine =
 		from_timer(engine, t, breadcrumbs.hangcheck);
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	unsigned int irq_count;
 
 	if (!b->irq_armed)
 		return;
 
-	irq_count = READ_ONCE(b->irq_count);
-	if (b->hangcheck_interrupts != irq_count) {
-		b->hangcheck_interrupts = irq_count;
-		mod_timer(&b->hangcheck, wait_timeout());
-		return;
-	}
+	if (b->irq_fired)
+		goto rearm;
 
-	/* We keep the hangcheck timer alive until we disarm the irq, even
+	/*
+	 * We keep the hangcheck timer alive until we disarm the irq, even
 	 * if there are no waiters at present.
 	 *
 	 * If the waiter was currently running, assume it hasn't had a chance
@@ -118,10 +193,13 @@ static void intel_breadcrumbs_hangcheck(struct timer_list *t)
 	 * but we still have a waiter. Assuming all batches complete within
 	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
 	 */
-	if (intel_engine_wakeup(engine) & ENGINE_WAKEUP_ASLEEP) {
+	synchronize_hardirq(engine->i915->drm.irq);
+	if (intel_engine_signal_breadcrumbs(engine)) {
 		missed_breadcrumb(engine);
 		mod_timer(&b->fake_irq, jiffies + 1);
 	} else {
+rearm:
+		b->irq_fired = false;
 		mod_timer(&b->hangcheck, wait_timeout());
 	}
 }
@@ -140,11 +218,7 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
 	 * oldest waiter to do the coherent seqno check.
 	 */
 
-	spin_lock_irq(&b->irq_lock);
-	if (b->irq_armed && !__intel_breadcrumbs_wakeup(b))
-		__intel_engine_disarm_breadcrumbs(engine);
-	spin_unlock_irq(&b->irq_lock);
-	if (!b->irq_armed)
+	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
 		return;
 
 	/* If the user has disabled the fake-irq, restore the hangchecking */
@@ -156,43 +230,6 @@ static void intel_breadcrumbs_fake_irq(struct timer_list *t)
 	mod_timer(&b->fake_irq, jiffies + 1);
 }
 
-static void irq_enable(struct intel_engine_cs *engine)
-{
-	if (!engine->irq_enable)
-		return;
-
-	/* Caller disables interrupts */
-	spin_lock(&engine->i915->irq_lock);
-	engine->irq_enable(engine);
-	spin_unlock(&engine->i915->irq_lock);
-}
-
-static void irq_disable(struct intel_engine_cs *engine)
-{
-	if (!engine->irq_disable)
-		return;
-
-	/* Caller disables interrupts */
-	spin_lock(&engine->i915->irq_lock);
-	engine->irq_disable(engine);
-	spin_unlock(&engine->i915->irq_lock);
-}
-
-void __intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	lockdep_assert_held(&b->irq_lock);
-	GEM_BUG_ON(b->irq_wait);
-	GEM_BUG_ON(!b->irq_armed);
-
-	GEM_BUG_ON(!b->irq_enabled);
-	if (!--b->irq_enabled)
-		irq_disable(engine);
-
-	b->irq_armed = false;
-}
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -215,40 +252,6 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
 	spin_unlock_irq(&b->irq_lock);
 }
 
-void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct intel_wait *wait, *n;
-
-	if (!b->irq_armed)
-		return;
-
-	/*
-	 * We only disarm the irq when we are idle (all requests completed),
-	 * so if the bottom-half remains asleep, it missed the request
-	 * completion.
-	 */
-	if (intel_engine_wakeup(engine) & ENGINE_WAKEUP_ASLEEP)
-		missed_breadcrumb(engine);
-
-	spin_lock_irq(&b->rb_lock);
-
-	spin_lock(&b->irq_lock);
-	b->irq_wait = NULL;
-	if (b->irq_armed)
-		__intel_engine_disarm_breadcrumbs(engine);
-	spin_unlock(&b->irq_lock);
-
-	rbtree_postorder_for_each_entry_safe(wait, n, &b->waiters, node) {
-		GEM_BUG_ON(!intel_engine_signaled(engine, wait->seqno));
-		RB_CLEAR_NODE(&wait->node);
-		wake_up_process(wait->tsk);
-	}
-	b->waiters = RB_ROOT;
-
-	spin_unlock_irq(&b->rb_lock);
-}
-
 static bool use_fake_irq(const struct intel_breadcrumbs *b)
 {
 	const struct intel_engine_cs *engine =
@@ -264,7 +267,7 @@ static bool use_fake_irq(const struct intel_breadcrumbs *b)
 	 * engine->seqno_barrier(), a timing error that should be transient
 	 * and unlikely to reoccur.
 	 */
-	return READ_ONCE(b->irq_count) == b->hangcheck_interrupts;
+	return !b->irq_fired;
 }
 
 static void enable_fake_irq(struct intel_breadcrumbs *b)
@@ -276,7 +279,7 @@ static void enable_fake_irq(struct intel_breadcrumbs *b)
 		mod_timer(&b->hangcheck, wait_timeout());
 }
 
-static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
+static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
 	struct intel_engine_cs *engine =
 		container_of(b, struct intel_engine_cs, breadcrumbs);
@@ -315,537 +318,135 @@ static bool __intel_breadcrumbs_enable_irq(struct intel_breadcrumbs *b)
 	return enabled;
 }
 
-static inline struct intel_wait *to_wait(struct rb_node *node)
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 {
-	return rb_entry(node, struct intel_wait, node);
-}
+	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 
-static inline void __intel_breadcrumbs_finish(struct intel_breadcrumbs *b,
-					      struct intel_wait *wait)
-{
-	lockdep_assert_held(&b->rb_lock);
-	GEM_BUG_ON(b->irq_wait == wait);
+	spin_lock_init(&b->irq_lock);
+	INIT_LIST_HEAD(&b->signalers);
 
-	/*
-	 * This request is completed, so remove it from the tree, mark it as
-	 * complete, and *then* wake up the associated task. N.B. when the
-	 * task wakes up, it will find the empty rb_node, discern that it
-	 * has already been removed from the tree and skip the serialisation
-	 * of the b->rb_lock and b->irq_lock. This means that the destruction
-	 * of the intel_wait is not serialised with the interrupt handler
-	 * by the waiter - it must instead be serialised by the caller.
-	 */
-	rb_erase(&wait->node, &b->waiters);
-	RB_CLEAR_NODE(&wait->node);
+	init_irq_work(&b->irq_work, signal_irq_work);
 
-	if (wait->tsk->state != TASK_RUNNING)
-		wake_up_process(wait->tsk); /* implicit smp_wmb() */
+	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
+	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
 }
 
-static inline void __intel_breadcrumbs_next(struct intel_engine_cs *engine,
-					    struct rb_node *next)
+static void cancel_fake_irq(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
 
-	spin_lock(&b->irq_lock);
-	GEM_BUG_ON(!b->irq_armed);
-	GEM_BUG_ON(!b->irq_wait);
-	b->irq_wait = to_wait(next);
-	spin_unlock(&b->irq_lock);
-
-	/* We always wake up the next waiter that takes over as the bottom-half
-	 * as we may delegate not only the irq-seqno barrier to the next waiter
-	 * but also the task of waking up concurrent waiters.
-	 */
-	if (next)
-		wake_up_process(to_wait(next)->tsk);
+	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
+	del_timer_sync(&b->hangcheck);
+	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
-static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
-				    struct intel_wait *wait)
+void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct rb_node **p, *parent, *completed;
-	bool first, armed;
-	u32 seqno;
+	unsigned long flags;
 
-	GEM_BUG_ON(!wait->seqno);
+	spin_lock_irqsave(&b->irq_lock, flags);
 
-	/* Insert the request into the retirement ordered list
-	 * of waiters by walking the rbtree. If we are the oldest
-	 * seqno in the tree (the first to be retired), then
-	 * set ourselves as the bottom-half.
-	 *
-	 * As we descend the tree, prune completed branches since we hold the
-	 * spinlock we know that the first_waiter must be delayed and can
-	 * reduce some of the sequential wake up latency if we take action
-	 * ourselves and wake up the completed tasks in parallel. Also, by
-	 * removing stale elements in the tree, we may be able to reduce the
-	 * ping-pong between the old bottom-half and ourselves as first-waiter.
+	/*
+	 * Leave the fake_irq timer enabled (if it is running), but clear the
+	 * bit so that it turns itself off on its next wake up and goes back
+	 * to the long hangcheck interval if still required.
 	 */
-	armed = false;
-	first = true;
-	parent = NULL;
-	completed = NULL;
-	seqno = intel_engine_get_seqno(engine);
-
-	 /* If the request completed before we managed to grab the spinlock,
-	  * return now before adding ourselves to the rbtree. We let the
-	  * current bottom-half handle any pending wakeups and instead
-	  * try and get out of the way quickly.
-	  */
-	if (i915_seqno_passed(seqno, wait->seqno)) {
-		RB_CLEAR_NODE(&wait->node);
-		return first;
-	}
-
-	p = &b->waiters.rb_node;
-	while (*p) {
-		parent = *p;
-		if (wait->seqno == to_wait(parent)->seqno) {
-			/* We have multiple waiters on the same seqno, select
-			 * the highest priority task (that with the smallest
-			 * task->prio) to serve as the bottom-half for this
-			 * group.
-			 */
-			if (wait->tsk->prio > to_wait(parent)->tsk->prio) {
-				p = &parent->rb_right;
-				first = false;
-			} else {
-				p = &parent->rb_left;
-			}
-		} else if (i915_seqno_passed(wait->seqno,
-					     to_wait(parent)->seqno)) {
-			p = &parent->rb_right;
-			if (i915_seqno_passed(seqno, to_wait(parent)->seqno))
-				completed = parent;
-			else
-				first = false;
-		} else {
-			p = &parent->rb_left;
-		}
-	}
-	rb_link_node(&wait->node, parent, p);
-	rb_insert_color(&wait->node, &b->waiters);
-
-	if (first) {
-		spin_lock(&b->irq_lock);
-		b->irq_wait = wait;
-		/* After assigning ourselves as the new bottom-half, we must
-		 * perform a cursory check to prevent a missed interrupt.
-		 * Either we miss the interrupt whilst programming the hardware,
-		 * or if there was a previous waiter (for a later seqno) they
-		 * may be woken instead of us (due to the inherent race
-		 * in the unlocked read of b->irq_seqno_bh in the irq handler)
-		 * and so we miss the wake up.
-		 */
-		armed = __intel_breadcrumbs_enable_irq(b);
-		spin_unlock(&b->irq_lock);
-	}
-
-	if (completed) {
-		/* Advance the bottom-half (b->irq_wait) before we wake up
-		 * the waiters who may scribble over their intel_wait
-		 * just as the interrupt handler is dereferencing it via
-		 * b->irq_wait.
-		 */
-		if (!first) {
-			struct rb_node *next = rb_next(completed);
-			GEM_BUG_ON(next == &wait->node);
-			__intel_breadcrumbs_next(engine, next);
-		}
-
-		do {
-			struct intel_wait *crumb = to_wait(completed);
-			completed = rb_prev(completed);
-			__intel_breadcrumbs_finish(b, crumb);
-		} while (completed);
-	}
-
-	GEM_BUG_ON(!b->irq_wait);
-	GEM_BUG_ON(!b->irq_armed);
-	GEM_BUG_ON(rb_first(&b->waiters) != &b->irq_wait->node);
-
-	return armed;
-}
-
-bool intel_engine_add_wait(struct intel_engine_cs *engine,
-			   struct intel_wait *wait)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	bool armed;
-
-	spin_lock_irq(&b->rb_lock);
-	armed = __intel_engine_add_wait(engine, wait);
-	spin_unlock_irq(&b->rb_lock);
-	if (armed)
-		return armed;
-
-	/* Make the caller recheck if its request has already started. */
-	return intel_engine_has_started(engine, wait->seqno);
-}
-
-static inline bool chain_wakeup(struct rb_node *rb, int priority)
-{
-	return rb && to_wait(rb)->tsk->prio <= priority;
-}
+	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 
-static inline int wakeup_priority(struct intel_breadcrumbs *b,
-				  struct task_struct *tsk)
-{
-	if (tsk == b->signaler)
-		return INT_MIN;
+	if (b->irq_enabled)
+		irq_enable(engine);
 	else
-		return tsk->prio;
-}
-
-static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
-				       struct intel_wait *wait)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	lockdep_assert_held(&b->rb_lock);
-
-	if (RB_EMPTY_NODE(&wait->node))
-		goto out;
-
-	if (b->irq_wait == wait) {
-		const int priority = wakeup_priority(b, wait->tsk);
-		struct rb_node *next;
-
-		/* We are the current bottom-half. Find the next candidate,
-		 * the first waiter in the queue on the remaining oldest
-		 * request. As multiple seqnos may complete in the time it
-		 * takes us to wake up and find the next waiter, we have to
-		 * wake up that waiter for it to perform its own coherent
-		 * completion check.
-		 */
-		next = rb_next(&wait->node);
-		if (chain_wakeup(next, priority)) {
-			/* If the next waiter is already complete,
-			 * wake it up and continue onto the next waiter. So
-			 * if have a small herd, they will wake up in parallel
-			 * rather than sequentially, which should reduce
-			 * the overall latency in waking all the completed
-			 * clients.
-			 *
-			 * However, waking up a chain adds extra latency to
-			 * the first_waiter. This is undesirable if that
-			 * waiter is a high priority task.
-			 */
-			u32 seqno = intel_engine_get_seqno(engine);
-
-			while (i915_seqno_passed(seqno, to_wait(next)->seqno)) {
-				struct rb_node *n = rb_next(next);
-
-				__intel_breadcrumbs_finish(b, to_wait(next));
-				next = n;
-				if (!chain_wakeup(next, priority))
-					break;
-			}
-		}
-
-		__intel_breadcrumbs_next(engine, next);
-	} else {
-		GEM_BUG_ON(rb_first(&b->waiters) == &wait->node);
-	}
-
-	GEM_BUG_ON(RB_EMPTY_NODE(&wait->node));
-	rb_erase(&wait->node, &b->waiters);
-	RB_CLEAR_NODE(&wait->node);
-
-out:
-	GEM_BUG_ON(b->irq_wait == wait);
-	GEM_BUG_ON(rb_first(&b->waiters) !=
-		   (b->irq_wait ? &b->irq_wait->node : NULL));
-}
-
-void intel_engine_remove_wait(struct intel_engine_cs *engine,
-			      struct intel_wait *wait)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	/* Quick check to see if this waiter was already decoupled from
-	 * the tree by the bottom-half to avoid contention on the spinlock
-	 * by the herd.
-	 */
-	if (RB_EMPTY_NODE(&wait->node)) {
-		GEM_BUG_ON(READ_ONCE(b->irq_wait) == wait);
-		return;
-	}
+		irq_disable(engine);
 
-	spin_lock_irq(&b->rb_lock);
-	__intel_engine_remove_wait(engine, wait);
-	spin_unlock_irq(&b->rb_lock);
+	spin_unlock_irqrestore(&b->irq_lock, flags);
 }
 
-static void signaler_set_rtpriority(void)
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 {
-	 struct sched_param param = { .sched_priority = 1 };
-
-	 sched_setscheduler_nocheck(current, SCHED_FIFO, &param);
+	cancel_fake_irq(engine);
 }
 
-static int intel_breadcrumbs_signaler(void *arg)
+bool intel_engine_enable_signaling(struct i915_request *rq)
 {
-	struct intel_engine_cs *engine = arg;
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct i915_request *rq, *n;
-
-	/* Install ourselves with high priority to reduce signalling latency */
-	signaler_set_rtpriority();
-
-	do {
-		bool do_schedule = true;
-		LIST_HEAD(list);
-		u32 seqno;
-
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (list_empty(&b->signals))
-			goto sleep;
-
-		/*
-		 * We are either woken up by the interrupt bottom-half,
-		 * or by a client adding a new signaller. In both cases,
-		 * the GPU seqno may have advanced beyond our oldest signal.
-		 * If it has, propagate the signal, remove the waiter and
-		 * check again with the next oldest signal. Otherwise we
-		 * need to wait for a new interrupt from the GPU or for
-		 * a new client.
-		 */
-		seqno = intel_engine_get_seqno(engine);
-
-		spin_lock_irq(&b->rb_lock);
-		list_for_each_entry_safe(rq, n, &b->signals, signaling.link) {
-			u32 this = rq->signaling.wait.seqno;
-
-			GEM_BUG_ON(!rq->signaling.wait.seqno);
-
-			if (!i915_seqno_passed(seqno, this))
-				break;
+	struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
 
-			if (likely(this == i915_request_global_seqno(rq))) {
-				__intel_engine_remove_wait(engine,
-							   &rq->signaling.wait);
+	GEM_BUG_ON(test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags));
 
-				rq->signaling.wait.seqno = 0;
-				__list_del_entry(&rq->signaling.link);
-
-				if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
-					      &rq->fence.flags)) {
-					list_add_tail(&rq->signaling.link,
-						      &list);
-					i915_request_get(rq);
-				}
-			}
-		}
-		spin_unlock_irq(&b->rb_lock);
+	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags))
+		return true;
 
-		if (!list_empty(&list)) {
-			local_bh_disable();
-			list_for_each_entry_safe(rq, n, &list, signaling.link) {
-				dma_fence_signal(&rq->fence);
-				GEM_BUG_ON(!i915_request_completed(rq));
-				i915_request_put(rq);
-			}
-			local_bh_enable(); /* kick start the tasklets */
+	spin_lock(&b->irq_lock);
+	if (test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags) &&
+	    !i915_request_completed(rq)) {
+		struct intel_context *ce = rq->hw_context;
+		struct list_head *pos;
 
-			/*
-			 * If the engine is saturated we may be continually
-			 * processing completed requests. This angers the
-			 * NMI watchdog if we never let anything else
-			 * have access to the CPU. Let's pretend to be nice
-			 * and relinquish the CPU if we burn through the
-			 * entire RT timeslice!
-			 */
-			do_schedule = need_resched();
-		}
+		__intel_breadcrumbs_arm_irq(b);
 
-		if (unlikely(do_schedule)) {
-sleep:
-			if (kthread_should_park())
-				kthread_parkme();
+		list_for_each_prev(pos, &ce->signals) {
+			struct i915_request *it =
+				list_entry(pos, typeof(*it), signal_link);
 
-			if (unlikely(kthread_should_stop()))
+			if (i915_seqno_passed(rq->fence.seqno, it->fence.seqno))
 				break;
-
-			schedule();
 		}
-	} while (1);
-	__set_current_state(TASK_RUNNING);
+		list_add(&rq->signal_link, pos);
+		if (pos == &ce->signals)
+			list_move_tail(&ce->signal_link, &b->signalers);
 
-	return 0;
-}
-
-static void insert_signal(struct intel_breadcrumbs *b,
-			  struct i915_request *request,
-			  const u32 seqno)
-{
-	struct i915_request *iter;
-
-	lockdep_assert_held(&b->rb_lock);
-
-	/*
-	 * A reasonable assumption is that we are called to add signals
-	 * in sequence, as the requests are submitted for execution and
-	 * assigned a global_seqno. This will be the case for the majority
-	 * of internally generated signals (inter-engine signaling).
-	 *
-	 * Out of order waiters triggering random signaling enabling will
-	 * be more problematic, but hopefully rare enough and the list
-	 * small enough that the O(N) insertion sort is not an issue.
-	 */
-
-	list_for_each_entry_reverse(iter, &b->signals, signaling.link)
-		if (i915_seqno_passed(seqno, iter->signaling.wait.seqno))
-			break;
-
-	list_add(&request->signaling.link, &iter->signaling.link);
-}
-
-bool intel_engine_enable_signaling(struct i915_request *request, bool wakeup)
-{
-	struct intel_engine_cs *engine = request->engine;
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct intel_wait *wait = &request->signaling.wait;
-	u32 seqno;
-
-	/*
-	 * Note that we may be called from an interrupt handler on another
-	 * device (e.g. nouveau signaling a fence completion causing us
-	 * to submit a request, and so enable signaling). As such,
-	 * we need to make sure that all other users of b->rb_lock protect
-	 * against interrupts, i.e. use spin_lock_irqsave.
-	 */
-
-	/* locked by dma_fence_enable_sw_signaling() (irqsafe fence->lock) */
-	GEM_BUG_ON(!irqs_disabled());
-	lockdep_assert_held(&request->lock);
-
-	seqno = i915_request_global_seqno(request);
-	if (!seqno) /* will be enabled later upon execution */
-		return true;
-
-	GEM_BUG_ON(wait->seqno);
-	wait->tsk = b->signaler;
-	wait->request = request;
-	wait->seqno = seqno;
-
-	/*
-	 * Add ourselves into the list of waiters, but registering our
-	 * bottom-half as the signaller thread. As per usual, only the oldest
-	 * waiter (not just signaller) is tasked as the bottom-half waking
-	 * up all completed waiters after the user interrupt.
-	 *
-	 * If we are the oldest waiter, enable the irq (after which we
-	 * must double check that the seqno did not complete).
-	 */
-	spin_lock(&b->rb_lock);
-	insert_signal(b, request, seqno);
-	wakeup &= __intel_engine_add_wait(engine, wait);
-	spin_unlock(&b->rb_lock);
-
-	if (wakeup) {
-		wake_up_process(b->signaler);
-		return !intel_wait_complete(wait);
+		set_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
 	}
+	spin_unlock(&b->irq_lock);
 
-	return true;
+	return !i915_request_completed(rq);
 }
 
-void intel_engine_cancel_signaling(struct i915_request *request)
+void intel_engine_cancel_signaling(struct i915_request *rq)
 {
-	struct intel_engine_cs *engine = request->engine;
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	GEM_BUG_ON(!irqs_disabled());
-	lockdep_assert_held(&request->lock);
+	struct intel_breadcrumbs *b = &rq->engine->breadcrumbs;
 
-	if (!READ_ONCE(request->signaling.wait.seqno))
+	if (!test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags))
 		return;
 
-	spin_lock(&b->rb_lock);
-	__intel_engine_remove_wait(engine, &request->signaling.wait);
-	if (fetch_and_zero(&request->signaling.wait.seqno))
-		__list_del_entry(&request->signaling.link);
-	spin_unlock(&b->rb_lock);
-}
-
-int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct task_struct *tsk;
-
-	spin_lock_init(&b->rb_lock);
-	spin_lock_init(&b->irq_lock);
-
-	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
-	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
-
-	INIT_LIST_HEAD(&b->signals);
-
-	/* Spawn a thread to provide a common bottom-half for all signals.
-	 * As this is an asynchronous interface we cannot steal the current
-	 * task for handling the bottom-half to the user interrupt, therefore
-	 * we create a thread to do the coherent seqno dance after the
-	 * interrupt and then signal the waitqueue (via the dma-buf/fence).
-	 */
-	tsk = kthread_run(intel_breadcrumbs_signaler, engine,
-			  "i915/signal:%d", engine->id);
-	if (IS_ERR(tsk))
-		return PTR_ERR(tsk);
-
-	b->signaler = tsk;
-
-	return 0;
-}
+	spin_lock(&b->irq_lock);
+	if (test_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags)) {
+		struct intel_context *ce = rq->hw_context;
 
-static void cancel_fake_irq(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+		list_del(&rq->signal_link);
+		if (list_empty(&ce->signals))
+			list_del_init(&ce->signal_link);
 
-	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
-	del_timer_sync(&b->hangcheck);
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
+		clear_bit(I915_FENCE_FLAG_SIGNAL, &rq->fence.flags);
+	}
+	spin_unlock(&b->irq_lock);
 }
 
-void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
+void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
+				    struct drm_printer *p)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	unsigned long flags;
-
-	spin_lock_irqsave(&b->irq_lock, flags);
-
-	/*
-	 * Leave the fake_irq timer enabled (if it is running), but clear the
-	 * bit so that it turns itself off on its next wake up and goes back
-	 * to the long hangcheck interval if still required.
-	 */
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
+	struct intel_context *ce;
+	struct i915_request *rq;
 
-	if (b->irq_enabled)
-		irq_enable(engine);
-	else
-		irq_disable(engine);
-
-	spin_unlock_irqrestore(&b->irq_lock, flags);
-}
-
-void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
+	if (list_empty(&b->signalers))
+		return;
 
-	/* The engines should be idle and all requests accounted for! */
-	WARN_ON(READ_ONCE(b->irq_wait));
-	WARN_ON(!RB_EMPTY_ROOT(&b->waiters));
-	WARN_ON(!list_empty(&b->signals));
+	drm_printf(p, "Signals:\n");
 
-	if (!IS_ERR_OR_NULL(b->signaler))
-		kthread_stop(b->signaler);
+	spin_lock_irq(&b->irq_lock);
+	list_for_each_entry(ce, &b->signalers, signal_link) {
+		list_for_each_entry(rq, &ce->signals, signal_link) {
+			drm_printf(p, "\t[%llx:%llx%s] @ %dms\n",
+				   rq->fence.context, rq->fence.seqno,
+				   i915_request_completed(rq) ? "!" :
+				   i915_request_started(rq) ? "*" :
+				   "",
+				   jiffies_to_msecs(jiffies - rq->emitted_jiffies));
+		}
+	}
+	spin_unlock_irq(&b->irq_lock);
 
-	cancel_fake_irq(engine);
+	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
+		drm_printf(p, "Fake irq active\n");
 }
-
-#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
-#include "selftests/intel_breadcrumbs.c"
-#endif
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index ae455b874c9f..429f21e16428 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -458,12 +458,6 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
 void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno)
 {
 	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
-
-	/* After manually advancing the seqno, fake the interrupt in case
-	 * there are any waiters for that seqno.
-	 */
-	intel_engine_wakeup(engine);
-
 	GEM_BUG_ON(intel_engine_get_seqno(engine) != seqno);
 }
 
@@ -667,16 +661,10 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 		}
 	}
 
-	ret = intel_engine_init_breadcrumbs(engine);
-	if (ret)
-		goto err_unpin_preempt;
+	intel_engine_init_breadcrumbs(engine);
 
 	return 0;
 
-err_unpin_preempt:
-	if (i915->preempt_context)
-		__intel_context_unpin(i915->preempt_context, engine);
-
 err_unpin_kernel:
 	__intel_context_unpin(i915->kernel_context, engine);
 	return ret;
@@ -1236,12 +1224,14 @@ static void print_request(struct drm_printer *m,
 
 	x = print_sched_attr(rq->i915, &rq->sched.attr, buf, x, sizeof(buf));
 
-	drm_printf(m, "%s%x%s [%llx:%llx]%s @ %dms: %s\n",
+	drm_printf(m, "%s%x%s%s [%llx:%llx]%s @ %dms: %s\n",
 		   prefix,
 		   rq->global_seqno,
 		   i915_request_completed(rq) ? "!" :
 		   i915_request_started(rq) ? "*" :
 		   "",
+		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			    &rq->fence.flags) ?  "+" : "",
 		   rq->fence.context, rq->fence.seqno,
 		   buf,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
@@ -1433,12 +1423,9 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 		       struct drm_printer *m,
 		       const char *header, ...)
 {
-	struct intel_breadcrumbs * const b = &engine->breadcrumbs;
 	struct i915_gpu_error * const error = &engine->i915->gpu_error;
 	struct i915_request *rq;
 	intel_wakeref_t wakeref;
-	unsigned long flags;
-	struct rb_node *rb;
 
 	if (header) {
 		va_list ap;
@@ -1504,21 +1491,12 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 
 	intel_execlists_show_requests(engine, m, print_request, 8);
 
-	spin_lock_irqsave(&b->rb_lock, flags);
-	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
-		struct intel_wait *w = rb_entry(rb, typeof(*w), node);
-
-		drm_printf(m, "\t%s [%d:%c] waiting for %x\n",
-			   w->tsk->comm, w->tsk->pid,
-			   task_state_to_char(w->tsk),
-			   w->seqno);
-	}
-	spin_unlock_irqrestore(&b->rb_lock, flags);
-
 	drm_printf(m, "HWSP:\n");
 	hexdump(m, engine->status_page.addr, PAGE_SIZE);
 
 	drm_printf(m, "Idle? %s\n", yesno(intel_engine_is_idle(engine)));
+
+	intel_engine_print_breadcrumbs(engine, m);
 }
 
 static u8 user_class_map[] = {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 2b4beb15a271..97732e387821 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -482,8 +482,8 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 
 	for (i = 0; i < GEN7_XCS_WA; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
-		*cs++ = I915_GEM_HWS_INDEX_ADDR;
-		*cs++ = rq->global_seqno;
+		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
+		*cs++ = rq->fence.seqno;
 	}
 
 	*cs++ = MI_FLUSH_DW;
@@ -733,7 +733,7 @@ static int init_ring_common(struct intel_engine_cs *engine)
 	}
 
 	/* Papering over lost _interrupts_ immediately following the restart */
-	intel_engine_wakeup(engine);
+	intel_engine_queue_breadcrumbs(engine);
 out:
 	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index de0099ea926b..6d1485208342 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -5,6 +5,7 @@
 #include <drm/drm_util.h>
 
 #include <linux/hashtable.h>
+#include <linux/irq_work.h>
 #include <linux/seqlock.h>
 
 #include "i915_gem_batch_pool.h"
@@ -384,22 +385,19 @@ struct intel_engine_cs {
 	 * the overhead of waking that client is much preferred.
 	 */
 	struct intel_breadcrumbs {
-		spinlock_t irq_lock; /* protects irq_*; irqsafe */
-		struct intel_wait *irq_wait; /* oldest waiter by retirement */
+		spinlock_t irq_lock;
+		struct list_head signalers;
 
-		spinlock_t rb_lock; /* protects the rb and wraps irq_lock */
-		struct rb_root waiters; /* sorted by retirement, priority */
-		struct list_head signals; /* sorted by retirement */
-		struct task_struct *signaler; /* used for fence signalling */
+		struct irq_work irq_work;
 
 		struct timer_list fake_irq; /* used after a missed interrupt */
 		struct timer_list hangcheck; /* detect missed interrupts */
 
 		unsigned int hangcheck_interrupts;
 		unsigned int irq_enabled;
-		unsigned int irq_count;
 
-		bool irq_armed : 1;
+		bool irq_armed;
+		bool irq_fired;
 	} breadcrumbs;
 
 	struct {
@@ -886,83 +884,32 @@ static inline bool intel_engine_has_started(struct intel_engine_cs *engine,
 void intel_engine_get_instdone(struct intel_engine_cs *engine,
 			       struct intel_instdone *instdone);
 
-/* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
-int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
-
-static inline void intel_wait_init(struct intel_wait *wait)
-{
-	wait->tsk = current;
-	wait->request = NULL;
-}
-
-static inline void intel_wait_init_for_seqno(struct intel_wait *wait, u32 seqno)
-{
-	wait->tsk = current;
-	wait->seqno = seqno;
-}
-
-static inline bool intel_wait_has_seqno(const struct intel_wait *wait)
-{
-	return wait->seqno;
-}
-
-static inline bool
-intel_wait_update_seqno(struct intel_wait *wait, u32 seqno)
-{
-	wait->seqno = seqno;
-	return intel_wait_has_seqno(wait);
-}
-
-static inline bool
-intel_wait_update_request(struct intel_wait *wait,
-			  const struct i915_request *rq)
-{
-	return intel_wait_update_seqno(wait, i915_request_global_seqno(rq));
-}
-
-static inline bool
-intel_wait_check_seqno(const struct intel_wait *wait, u32 seqno)
-{
-	return wait->seqno == seqno;
-}
-
-static inline bool
-intel_wait_check_request(const struct intel_wait *wait,
-			 const struct i915_request *rq)
-{
-	return intel_wait_check_seqno(wait, i915_request_global_seqno(rq));
-}
-
-static inline bool intel_wait_complete(const struct intel_wait *wait)
-{
-	return RB_EMPTY_NODE(&wait->node);
-}
+void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
+void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
 
-bool intel_engine_add_wait(struct intel_engine_cs *engine,
-			   struct intel_wait *wait);
-void intel_engine_remove_wait(struct intel_engine_cs *engine,
-			      struct intel_wait *wait);
-bool intel_engine_enable_signaling(struct i915_request *request, bool wakeup);
+bool intel_engine_enable_signaling(struct i915_request *request);
 void intel_engine_cancel_signaling(struct i915_request *request);
 
-static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
-{
-	return READ_ONCE(engine->breadcrumbs.irq_wait);
-}
-
-unsigned int intel_engine_wakeup(struct intel_engine_cs *engine);
-#define ENGINE_WAKEUP_WAITER BIT(0)
-#define ENGINE_WAKEUP_ASLEEP BIT(1)
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine);
 void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine);
 
-void __intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine);
+bool intel_engine_signal_breadcrumbs(struct intel_engine_cs *engine);
 void intel_engine_disarm_breadcrumbs(struct intel_engine_cs *engine);
 
+static inline void
+intel_engine_queue_breadcrumbs(struct intel_engine_cs *engine)
+{
+	irq_work_queue(&engine->breadcrumbs.irq_work);
+}
+
+bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine);
+
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine);
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine);
 
+void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
+				    struct drm_printer *p);
+
 static inline u32 *gen8_emit_pipe_control(u32 *batch, u32 flags, u32 offset)
 {
 	memset(batch, 0, 6 * sizeof(u32));
diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
index 4a83a1c6c406..88e5ab586337 100644
--- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
@@ -15,7 +15,6 @@ selftest(scatterlist, scatterlist_mock_selftests)
 selftest(syncmap, i915_syncmap_mock_selftests)
 selftest(uncore, intel_uncore_mock_selftests)
 selftest(engine, intel_engine_cs_mock_selftests)
-selftest(breadcrumbs, intel_breadcrumbs_mock_selftests)
 selftest(timelines, i915_timeline_mock_selftests)
 selftest(requests, i915_request_mock_selftests)
 selftest(objects, i915_gem_object_mock_selftests)
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 2e14d6d3bad7..48b6159afe64 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -25,8 +25,11 @@
 #include <linux/prime_numbers.h>
 
 #include "../i915_selftest.h"
+#include "i915_random.h"
+#include "lib_sw_fence.h"
 
 #include "mock_context.h"
+#include "mock_drm.h"
 #include "mock_gem_device.h"
 
 static int igt_add_request(void *arg)
@@ -246,6 +249,239 @@ static int igt_request_rewind(void *arg)
 	return err;
 }
 
+struct smoketest {
+	struct intel_engine_cs *engine;
+	struct i915_gem_context **contexts;
+	unsigned int ncontexts, max_batch;
+	atomic_long_t num_waits, num_fences;
+	struct i915_request *(*request_alloc)(struct i915_gem_context *,
+					      struct intel_engine_cs *);
+
+};
+
+static struct i915_request *
+__mock_request_alloc(struct i915_gem_context *ctx,
+		     struct intel_engine_cs *engine)
+{
+	return mock_request(engine, ctx, 0);
+}
+
+static struct i915_request *
+__live_request_alloc(struct i915_gem_context *ctx,
+		     struct intel_engine_cs *engine)
+{
+	return i915_request_alloc(engine, ctx);
+}
+
+static int __igt_breadcrumbs_smoketest(void *arg)
+{
+	struct smoketest *t = arg;
+	struct mutex *BKL = &t->engine->i915->drm.struct_mutex;
+	struct i915_request **requests;
+	I915_RND_STATE(prng);
+	const unsigned int total = 4 * t->ncontexts + 1;
+	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
+	unsigned int num_waits = 0, num_fences = 0;
+	unsigned int *order;
+	int err = 0;
+
+	requests = kmalloc_array(total, sizeof(*requests), GFP_KERNEL);
+	if (!requests)
+		return -ENOMEM;
+
+	order = i915_random_order(total, &prng);
+	if (!order) {
+		err = -ENOMEM;
+		goto out_requests;
+	}
+
+	while (!kthread_should_stop()) {
+		struct i915_sw_fence *submit, *wait;
+		unsigned int n, count;
+
+		submit = heap_fence_create(GFP_KERNEL);
+		if (!submit) {
+			err = -ENOMEM;
+			break;
+		}
+
+		wait = heap_fence_create(GFP_KERNEL);
+		if (!wait) {
+			i915_sw_fence_commit(submit);
+			heap_fence_put(submit);
+			err = -ENOMEM;
+			break;
+		}
+
+		i915_random_reorder(order, total, &prng);
+		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);
+
+		for (n = 0; n < count; n++) {
+			struct i915_gem_context *ctx =
+				t->contexts[order[n] % t->ncontexts];
+			struct i915_request *rq;
+
+			mutex_lock(BKL);
+
+			rq = t->request_alloc(ctx, t->engine);
+			if (IS_ERR(rq)) {
+				mutex_unlock(BKL);
+				err = PTR_ERR(rq);
+				count = n;
+				break;
+			}
+
+			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
+							       submit,
+							       GFP_KERNEL);
+
+			requests[n] = i915_request_get(rq);
+			i915_request_add(rq);
+
+			mutex_unlock(BKL);
+
+			if (err >= 0)
+				err = i915_sw_fence_await_dma_fence(wait,
+								    &rq->fence,
+								    0,
+								    GFP_KERNEL);
+			if (err < 0) {
+				i915_request_put(rq);
+				count = n;
+				break;
+			}
+		}
+
+		i915_sw_fence_commit(submit);
+		i915_sw_fence_commit(wait);
+
+		if (!wait_event_timeout(wait->wait,
+					i915_sw_fence_done(wait),
+					HZ / 2)) {
+			struct i915_request *rq = requests[count - 1];
+
+			pr_err("waiting for %d fences (last %llx:%lld) on %s timed out!\n",
+			       count,
+			       rq->fence.context, rq->fence.seqno,
+			       t->engine->name);
+			i915_gem_set_wedged(t->engine->i915);
+			GEM_BUG_ON(!i915_request_completed(rq));
+			i915_sw_fence_wait(wait);
+			err = -EIO;
+		}
+
+		for (n = 0; n < count; n++) {
+			struct i915_request *rq = requests[n];
+
+			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
+				      &rq->fence.flags)) {
+				pr_err("%llu:%llu was not signaled!\n",
+				       rq->fence.context, rq->fence.seqno);
+				err = -EINVAL;
+			}
+
+			i915_request_put(rq);
+		}
+
+		heap_fence_put(wait);
+		heap_fence_put(submit);
+
+		if (err < 0)
+			break;
+
+		num_fences += count;
+		num_waits++;
+
+		cond_resched();
+	}
+
+	atomic_long_add(num_fences, &t->num_fences);
+	atomic_long_add(num_waits, &t->num_waits);
+
+	kfree(order);
+out_requests:
+	kfree(requests);
+	return err;
+}
+
+static int mock_breadcrumbs_smoketest(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct smoketest t = {
+		.engine = i915->engine[RCS],
+		.ncontexts = 1024,
+		.max_batch = 1024,
+		.request_alloc = __mock_request_alloc
+	};
+	unsigned int ncpus = num_online_cpus();
+	struct task_struct **threads;
+	unsigned int n;
+	int ret = 0;
+
+	threads = kmalloc_array(ncpus, sizeof(*threads), GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
+
+	t.contexts =
+		kmalloc_array(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
+	if (!t.contexts) {
+		ret = -ENOMEM;
+		goto out_threads;
+	}
+
+	mutex_lock(&t.engine->i915->drm.struct_mutex);
+	for (n = 0; n < t.ncontexts; n++) {
+		t.contexts[n] = mock_context(t.engine->i915, "mock");
+		if (!t.contexts[n]) {
+			ret = -ENOMEM;
+			goto out_contexts;
+		}
+	}
+
+	for (n = 0; n < ncpus; n++) {
+		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
+					 &t, "igt/%d", n);
+		if (IS_ERR(threads[n])) {
+			ret = PTR_ERR(threads[n]);
+			ncpus = n;
+			break;
+		}
+
+		get_task_struct(threads[n]);
+	}
+	mutex_unlock(&t.engine->i915->drm.struct_mutex);
+
+	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
+
+	for (n = 0; n < ncpus; n++) {
+		int err;
+
+		err = kthread_stop(threads[n]);
+		if (err < 0 && !ret)
+			ret = err;
+
+		put_task_struct(threads[n]);
+	}
+	pr_info("Completed %lu waits for %lu fences across %d cpus\n",
+		atomic_long_read(&t.num_waits),
+		atomic_long_read(&t.num_fences),
+		ncpus);
+
+	mutex_lock(&t.engine->i915->drm.struct_mutex);
+out_contexts:
+	for (n = 0; n < t.ncontexts; n++) {
+		if (!t.contexts[n])
+			break;
+		mock_context_close(t.contexts[n]);
+	}
+	mutex_unlock(&t.engine->i915->drm.struct_mutex);
+	kfree(t.contexts);
+out_threads:
+	kfree(threads);
+
+	return ret;
+}
+
 int i915_request_mock_selftests(void)
 {
 	static const struct i915_subtest tests[] = {
@@ -253,6 +489,7 @@ int i915_request_mock_selftests(void)
 		SUBTEST(igt_wait_request),
 		SUBTEST(igt_fence_wait),
 		SUBTEST(igt_request_rewind),
+		SUBTEST(mock_breadcrumbs_smoketest),
 	};
 	struct drm_i915_private *i915;
 	intel_wakeref_t wakeref;
@@ -872,6 +1109,166 @@ static int live_sequential_engines(void *arg)
 	return err;
 }
 
+static int
+max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
+{
+	struct i915_request *rq;
+	int ret;
+
+	/*
+	 * Before execlists, all contexts share the same ringbuffer. With
+	 * execlists, each context/engine has a separate ringbuffer and
+	 * for the purposes of this test, inexhaustible.
+	 *
+	 * For the global ringbuffer though, we have to be very careful
+	 * that we do not wrap while preventing the execution of requests
+	 * with an unsignaled fence.
+	 */
+	if (HAS_EXECLISTS(ctx->i915))
+		return INT_MAX;
+
+	rq = i915_request_alloc(engine, ctx);
+	if (IS_ERR(rq)) {
+		ret = PTR_ERR(rq);
+	} else {
+		int sz;
+
+		ret = rq->ring->size - rq->reserved_space;
+		i915_request_add(rq);
+
+		sz = rq->ring->emit - rq->head;
+		if (sz < 0)
+			sz += rq->ring->size;
+		ret /= sz;
+		ret /= 2; /* leave half spare, in case of emergency! */
+
+		/* One ring interleaved between requests from all cpus */
+		ret /= num_online_cpus() + 1;
+	}
+
+	return ret;
+}
+
+static int live_breadcrumbs_smoketest(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct smoketest t[I915_NUM_ENGINES];
+	unsigned int ncpus = num_online_cpus();
+	unsigned long num_waits, num_fences;
+	struct intel_engine_cs *engine;
+	struct task_struct **threads;
+	enum intel_engine_id id;
+	intel_wakeref_t wakeref;
+	struct drm_file *file;
+	struct live_test live;
+	unsigned int n;
+	int ret = 0;
+
+	wakeref = intel_runtime_pm_get(i915);
+
+	file = mock_file(i915);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto out_rpm;
+	}
+
+	threads = kcalloc(ncpus * I915_NUM_ENGINES,
+			  sizeof(*threads),
+			  GFP_KERNEL);
+	if (!threads)
+		return -ENOMEM;
+
+	memset(&t[0], 0, sizeof(t[0]));
+	t[0].request_alloc = __live_request_alloc;
+	t[0].ncontexts = 64;
+	t[0].contexts = kmalloc_array(t[0].ncontexts,
+				      sizeof(*t[0].contexts),
+				      GFP_KERNEL);
+	if (!t[0].contexts) {
+		ret = -ENOMEM;
+		goto out_threads;
+	}
+
+	mutex_lock(&i915->drm.struct_mutex);
+	for (n = 0; n < t[0].ncontexts; n++) {
+		t[0].contexts[n] = live_context(i915, file);
+		if (!t[0].contexts[n]) {
+			ret = -ENOMEM;
+			goto out_contexts;
+		}
+	}
+
+	ret = begin_live_test(&live, i915, __func__, "");
+	if (ret)
+		goto out_contexts;
+
+	for_each_engine(engine, i915, id) {
+		t[id] = t[0];
+		t[id].engine = engine;
+		t[id].max_batch = max_batches(t[0].contexts[0], engine);
+		if (t[id].max_batch < 0) {
+			ret = t[id].max_batch;
+			goto out_flush;
+		}
+		pr_debug("Limiting batches to %d requests on %s\n",
+			 t[id].max_batch, engine->name);
+
+		for (n = 0; n < ncpus; n++) {
+			struct task_struct *tsk;
+
+			tsk = kthread_run(__igt_breadcrumbs_smoketest,
+					  &t[id], "igt/%d.%d", id, n);
+			if (IS_ERR(tsk)) {
+				ret = PTR_ERR(tsk);
+				goto out_flush;
+			}
+
+			get_task_struct(tsk);
+			threads[id * ncpus + n] = tsk;
+		}
+	}
+	mutex_unlock(&i915->drm.struct_mutex);
+
+	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
+
+out_flush:
+	num_waits = 0;
+	num_fences = 0;
+	for_each_engine(engine, i915, id) {
+		for (n = 0; n < ncpus; n++) {
+			struct task_struct *tsk = threads[id * ncpus + n];
+			int err;
+
+			if (!tsk)
+				continue;
+
+			err = kthread_stop(tsk);
+			if (err < 0 && !ret)
+				ret = err;
+
+			put_task_struct(tsk);
+		}
+
+		num_waits += atomic_long_read(&t[id].num_waits);
+		num_fences += atomic_long_read(&t[id].num_fences);
+	}
+	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
+		num_waits, num_fences, RUNTIME_INFO(i915)->num_rings, ncpus);
+
+	mutex_lock(&i915->drm.struct_mutex);
+	ret = end_live_test(&live) ?: ret;
+out_contexts:
+	mutex_unlock(&i915->drm.struct_mutex);
+	kfree(t[0].contexts);
+out_threads:
+	kfree(threads);
+	mock_file_free(i915, file);
+out_rpm:
+	intel_runtime_pm_put(i915, wakeref);
+
+	return ret;
+}
+
 int i915_request_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -879,6 +1276,7 @@ int i915_request_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_all_engines),
 		SUBTEST(live_sequential_engines),
 		SUBTEST(live_empty_request),
+		SUBTEST(live_breadcrumbs_smoketest),
 	};
 
 	if (i915_terminally_wedged(&i915->gpu_error))
diff --git a/drivers/gpu/drm/i915/selftests/igt_spinner.c b/drivers/gpu/drm/i915/selftests/igt_spinner.c
index 0e70df0230b8..9ebd9225684e 100644
--- a/drivers/gpu/drm/i915/selftests/igt_spinner.c
+++ b/drivers/gpu/drm/i915/selftests/igt_spinner.c
@@ -185,11 +185,6 @@ void igt_spinner_fini(struct igt_spinner *spin)
 
 bool igt_wait_for_spinner(struct igt_spinner *spin, struct i915_request *rq)
 {
-	if (!wait_event_timeout(rq->execute,
-				READ_ONCE(rq->global_seqno),
-				msecs_to_jiffies(10)))
-		return false;
-
 	return !(wait_for_us(i915_seqno_passed(hws_seqno(spin, rq),
 					       rq->fence.seqno),
 			     10) &&
diff --git a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c b/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
deleted file mode 100644
index f03b407fdbe2..000000000000
--- a/drivers/gpu/drm/i915/selftests/intel_breadcrumbs.c
+++ /dev/null
@@ -1,470 +0,0 @@
-/*
- * Copyright © 2016 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- *
- */
-
-#include "../i915_selftest.h"
-#include "i915_random.h"
-
-#include "mock_gem_device.h"
-#include "mock_engine.h"
-
-static int check_rbtree(struct intel_engine_cs *engine,
-			const unsigned long *bitmap,
-			const struct intel_wait *waiters,
-			const int count)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-	struct rb_node *rb;
-	int n;
-
-	if (&b->irq_wait->node != rb_first(&b->waiters)) {
-		pr_err("First waiter does not match first element of wait-tree\n");
-		return -EINVAL;
-	}
-
-	n = find_first_bit(bitmap, count);
-	for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
-		struct intel_wait *w = container_of(rb, typeof(*w), node);
-		int idx = w - waiters;
-
-		if (!test_bit(idx, bitmap)) {
-			pr_err("waiter[%d, seqno=%d] removed but still in wait-tree\n",
-			       idx, w->seqno);
-			return -EINVAL;
-		}
-
-		if (n != idx) {
-			pr_err("waiter[%d, seqno=%d] does not match expected next element in tree [%d]\n",
-			       idx, w->seqno, n);
-			return -EINVAL;
-		}
-
-		n = find_next_bit(bitmap, count, n + 1);
-	}
-
-	return 0;
-}
-
-static int check_completion(struct intel_engine_cs *engine,
-			    const unsigned long *bitmap,
-			    const struct intel_wait *waiters,
-			    const int count)
-{
-	int n;
-
-	for (n = 0; n < count; n++) {
-		if (intel_wait_complete(&waiters[n]) != !!test_bit(n, bitmap))
-			continue;
-
-		pr_err("waiter[%d, seqno=%d] is %s, but expected %s\n",
-		       n, waiters[n].seqno,
-		       intel_wait_complete(&waiters[n]) ? "complete" : "active",
-		       test_bit(n, bitmap) ? "active" : "complete");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int check_rbtree_empty(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	if (b->irq_wait) {
-		pr_err("Empty breadcrumbs still has a waiter\n");
-		return -EINVAL;
-	}
-
-	if (!RB_EMPTY_ROOT(&b->waiters)) {
-		pr_err("Empty breadcrumbs, but wait-tree not empty\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int igt_random_insert_remove(void *arg)
-{
-	const u32 seqno_bias = 0x1000;
-	I915_RND_STATE(prng);
-	struct intel_engine_cs *engine = arg;
-	struct intel_wait *waiters;
-	const int count = 4096;
-	unsigned int *order;
-	unsigned long *bitmap;
-	int err = -ENOMEM;
-	int n;
-
-	mock_engine_reset(engine);
-
-	waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL);
-	if (!waiters)
-		goto out_engines;
-
-	bitmap = kcalloc(DIV_ROUND_UP(count, BITS_PER_LONG), sizeof(*bitmap),
-			 GFP_KERNEL);
-	if (!bitmap)
-		goto out_waiters;
-
-	order = i915_random_order(count, &prng);
-	if (!order)
-		goto out_bitmap;
-
-	for (n = 0; n < count; n++)
-		intel_wait_init_for_seqno(&waiters[n], seqno_bias + n);
-
-	err = check_rbtree(engine, bitmap, waiters, count);
-	if (err)
-		goto out_order;
-
-	/* Add and remove waiters into the rbtree in random order. At each
-	 * step, we verify that the rbtree is correctly ordered.
-	 */
-	for (n = 0; n < count; n++) {
-		int i = order[n];
-
-		intel_engine_add_wait(engine, &waiters[i]);
-		__set_bit(i, bitmap);
-
-		err = check_rbtree(engine, bitmap, waiters, count);
-		if (err)
-			goto out_order;
-	}
-
-	i915_random_reorder(order, count, &prng);
-	for (n = 0; n < count; n++) {
-		int i = order[n];
-
-		intel_engine_remove_wait(engine, &waiters[i]);
-		__clear_bit(i, bitmap);
-
-		err = check_rbtree(engine, bitmap, waiters, count);
-		if (err)
-			goto out_order;
-	}
-
-	err = check_rbtree_empty(engine);
-out_order:
-	kfree(order);
-out_bitmap:
-	kfree(bitmap);
-out_waiters:
-	kvfree(waiters);
-out_engines:
-	mock_engine_flush(engine);
-	return err;
-}
-
-static int igt_insert_complete(void *arg)
-{
-	const u32 seqno_bias = 0x1000;
-	struct intel_engine_cs *engine = arg;
-	struct intel_wait *waiters;
-	const int count = 4096;
-	unsigned long *bitmap;
-	int err = -ENOMEM;
-	int n, m;
-
-	mock_engine_reset(engine);
-
-	waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL);
-	if (!waiters)
-		goto out_engines;
-
-	bitmap = kcalloc(DIV_ROUND_UP(count, BITS_PER_LONG), sizeof(*bitmap),
-			 GFP_KERNEL);
-	if (!bitmap)
-		goto out_waiters;
-
-	for (n = 0; n < count; n++) {
-		intel_wait_init_for_seqno(&waiters[n], n + seqno_bias);
-		intel_engine_add_wait(engine, &waiters[n]);
-		__set_bit(n, bitmap);
-	}
-	err = check_rbtree(engine, bitmap, waiters, count);
-	if (err)
-		goto out_bitmap;
-
-	/* On each step, we advance the seqno so that several waiters are then
-	 * complete (we increase the seqno by increasingly larger values to
-	 * retire more and more waiters at once). All retired waiters should
-	 * be woken and removed from the rbtree, and so that we check.
-	 */
-	for (n = 0; n < count; n = m) {
-		int seqno = 2 * n;
-
-		GEM_BUG_ON(find_first_bit(bitmap, count) != n);
-
-		if (intel_wait_complete(&waiters[n])) {
-			pr_err("waiter[%d, seqno=%d] completed too early\n",
-			       n, waiters[n].seqno);
-			err = -EINVAL;
-			goto out_bitmap;
-		}
-
-		/* complete the following waiters */
-		mock_seqno_advance(engine, seqno + seqno_bias);
-		for (m = n; m <= seqno; m++) {
-			if (m == count)
-				break;
-
-			GEM_BUG_ON(!test_bit(m, bitmap));
-			__clear_bit(m, bitmap);
-		}
-
-		intel_engine_remove_wait(engine, &waiters[n]);
-		RB_CLEAR_NODE(&waiters[n].node);
-
-		err = check_rbtree(engine, bitmap, waiters, count);
-		if (err) {
-			pr_err("rbtree corrupt after seqno advance to %d\n",
-			       seqno + seqno_bias);
-			goto out_bitmap;
-		}
-
-		err = check_completion(engine, bitmap, waiters, count);
-		if (err) {
-			pr_err("completions after seqno advance to %d failed\n",
-			       seqno + seqno_bias);
-			goto out_bitmap;
-		}
-	}
-
-	err = check_rbtree_empty(engine);
-out_bitmap:
-	kfree(bitmap);
-out_waiters:
-	kvfree(waiters);
-out_engines:
-	mock_engine_flush(engine);
-	return err;
-}
-
-struct igt_wakeup {
-	struct task_struct *tsk;
-	atomic_t *ready, *set, *done;
-	struct intel_engine_cs *engine;
-	unsigned long flags;
-#define STOP 0
-#define IDLE 1
-	wait_queue_head_t *wq;
-	u32 seqno;
-};
-
-static bool wait_for_ready(struct igt_wakeup *w)
-{
-	DEFINE_WAIT(ready);
-
-	set_bit(IDLE, &w->flags);
-	if (atomic_dec_and_test(w->done))
-		wake_up_var(w->done);
-
-	if (test_bit(STOP, &w->flags))
-		goto out;
-
-	for (;;) {
-		prepare_to_wait(w->wq, &ready, TASK_INTERRUPTIBLE);
-		if (atomic_read(w->ready) == 0)
-			break;
-
-		schedule();
-	}
-	finish_wait(w->wq, &ready);
-
-out:
-	clear_bit(IDLE, &w->flags);
-	if (atomic_dec_and_test(w->set))
-		wake_up_var(w->set);
-
-	return !test_bit(STOP, &w->flags);
-}
-
-static int igt_wakeup_thread(void *arg)
-{
-	struct igt_wakeup *w = arg;
-	struct intel_wait wait;
-
-	while (wait_for_ready(w)) {
-		GEM_BUG_ON(kthread_should_stop());
-
-		intel_wait_init_for_seqno(&wait, w->seqno);
-		intel_engine_add_wait(w->engine, &wait);
-		for (;;) {
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			if (i915_seqno_passed(intel_engine_get_seqno(w->engine),
-					      w->seqno))
-				break;
-
-			if (test_bit(STOP, &w->flags)) /* emergency escape */
-				break;
-
-			schedule();
-		}
-		intel_engine_remove_wait(w->engine, &wait);
-		__set_current_state(TASK_RUNNING);
-	}
-
-	return 0;
-}
-
-static void igt_wake_all_sync(atomic_t *ready,
-			      atomic_t *set,
-			      atomic_t *done,
-			      wait_queue_head_t *wq,
-			      int count)
-{
-	atomic_set(set, count);
-	atomic_set(ready, 0);
-	wake_up_all(wq);
-
-	wait_var_event(set, !atomic_read(set));
-	atomic_set(ready, count);
-	atomic_set(done, count);
-}
-
-static int igt_wakeup(void *arg)
-{
-	I915_RND_STATE(prng);
-	struct intel_engine_cs *engine = arg;
-	struct igt_wakeup *waiters;
-	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
-	const int count = 4096;
-	const u32 max_seqno = count / 4;
-	atomic_t ready, set, done;
-	int err = -ENOMEM;
-	int n, step;
-
-	mock_engine_reset(engine);
-
-	waiters = kvmalloc_array(count, sizeof(*waiters), GFP_KERNEL);
-	if (!waiters)
-		goto out_engines;
-
-	/* Create a large number of threads, each waiting on a random seqno.
-	 * Multiple waiters will be waiting for the same seqno.
-	 */
-	atomic_set(&ready, count);
-	for (n = 0; n < count; n++) {
-		waiters[n].wq = &wq;
-		waiters[n].ready = &ready;
-		waiters[n].set = &set;
-		waiters[n].done = &done;
-		waiters[n].engine = engine;
-		waiters[n].flags = BIT(IDLE);
-
-		waiters[n].tsk = kthread_run(igt_wakeup_thread, &waiters[n],
-					     "i915/igt:%d", n);
-		if (IS_ERR(waiters[n].tsk))
-			goto out_waiters;
-
-		get_task_struct(waiters[n].tsk);
-	}
-
-	for (step = 1; step <= max_seqno; step <<= 1) {
-		u32 seqno;
-
-		/* The waiter threads start paused as we assign them a random
-		 * seqno and reset the engine. Once the engine is reset,
-		 * we signal that the threads may begin their wait upon their
-		 * seqno.
-		 */
-		for (n = 0; n < count; n++) {
-			GEM_BUG_ON(!test_bit(IDLE, &waiters[n].flags));
-			waiters[n].seqno =
-				1 + prandom_u32_state(&prng) % max_seqno;
-		}
-		mock_seqno_advance(engine, 0);
-		igt_wake_all_sync(&ready, &set, &done, &wq, count);
-
-		/* Simulate the GPU doing chunks of work, with one or more
-		 * seqno appearing to finish at the same time. A random number
-		 * of threads will be waiting upon the update and hopefully be
-		 * woken.
-		 */
-		for (seqno = 1; seqno <= max_seqno + step; seqno += step) {
-			usleep_range(50, 500);
-			mock_seqno_advance(engine, seqno);
-		}
-		GEM_BUG_ON(intel_engine_get_seqno(engine) < 1 + max_seqno);
-
-		/* With the seqno now beyond any of the waiting threads, they
-		 * should all be woken, see that they are complete and signal
-		 * that they are ready for the next test. We wait until all
-		 * threads are complete and waiting for us (i.e. not a seqno).
-		 */
-		if (!wait_var_event_timeout(&done,
-					    !atomic_read(&done), 10 * HZ)) {
-			pr_err("Timed out waiting for %d remaining waiters\n",
-			       atomic_read(&done));
-			err = -ETIMEDOUT;
-			break;
-		}
-
-		err = check_rbtree_empty(engine);
-		if (err)
-			break;
-	}
-
-out_waiters:
-	for (n = 0; n < count; n++) {
-		if (IS_ERR(waiters[n].tsk))
-			break;
-
-		set_bit(STOP, &waiters[n].flags);
-	}
-	mock_seqno_advance(engine, INT_MAX); /* wakeup any broken waiters */
-	igt_wake_all_sync(&ready, &set, &done, &wq, n);
-
-	for (n = 0; n < count; n++) {
-		if (IS_ERR(waiters[n].tsk))
-			break;
-
-		kthread_stop(waiters[n].tsk);
-		put_task_struct(waiters[n].tsk);
-	}
-
-	kvfree(waiters);
-out_engines:
-	mock_engine_flush(engine);
-	return err;
-}
-
-int intel_breadcrumbs_mock_selftests(void)
-{
-	static const struct i915_subtest tests[] = {
-		SUBTEST(igt_random_insert_remove),
-		SUBTEST(igt_insert_complete),
-		SUBTEST(igt_wakeup),
-	};
-	struct drm_i915_private *i915;
-	int err;
-
-	i915 = mock_gem_device();
-	if (!i915)
-		return -ENOMEM;
-
-	err = i915_subtests(tests, i915->engine[RCS]);
-	drm_dev_put(&i915->drm);
-
-	return err;
-}
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 9d0cc9d63a1e..128d37bba1ac 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1127,7 +1127,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 
 	wait_for_completion(&arg.completion);
 
-	if (wait_for(waitqueue_active(&rq->execute), 10)) {
+	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
 		struct drm_printer p = drm_info_printer(i915->drm.dev);
 
 		pr_err("igt/evict_vma kthread did not wait\n");
diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
index b26f07b55d86..2bfa72c1654b 100644
--- a/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
+++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.c
@@ -76,3 +76,57 @@ void timed_fence_fini(struct timed_fence *tf)
 	destroy_timer_on_stack(&tf->timer);
 	i915_sw_fence_fini(&tf->fence);
 }
+
+struct heap_fence {
+	struct i915_sw_fence fence;
+	union {
+		struct kref ref;
+		struct rcu_head rcu;
+	};
+};
+
+static int __i915_sw_fence_call
+heap_fence_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	struct heap_fence *h = container_of(fence, typeof(*h), fence);
+
+	switch (state) {
+	case FENCE_COMPLETE:
+		break;
+
+	case FENCE_FREE:
+		heap_fence_put(&h->fence);
+	}
+
+	return NOTIFY_DONE;
+}
+
+struct i915_sw_fence *heap_fence_create(gfp_t gfp)
+{
+	struct heap_fence *h;
+
+	h = kmalloc(sizeof(*h), gfp);
+	if (!h)
+		return NULL;
+
+	i915_sw_fence_init(&h->fence, heap_fence_notify);
+	refcount_set(&h->ref.refcount, 2);
+
+	return &h->fence;
+}
+
+static void heap_fence_release(struct kref *ref)
+{
+	struct heap_fence *h = container_of(ref, typeof(*h), ref);
+
+	i915_sw_fence_fini(&h->fence);
+
+	kfree_rcu(h, rcu);
+}
+
+void heap_fence_put(struct i915_sw_fence *fence)
+{
+	struct heap_fence *h = container_of(fence, typeof(*h), fence);
+
+	kref_put(&h->ref, heap_fence_release);
+}
diff --git a/drivers/gpu/drm/i915/selftests/lib_sw_fence.h b/drivers/gpu/drm/i915/selftests/lib_sw_fence.h
index 474aafb92ae1..1f9927e10f3a 100644
--- a/drivers/gpu/drm/i915/selftests/lib_sw_fence.h
+++ b/drivers/gpu/drm/i915/selftests/lib_sw_fence.h
@@ -39,4 +39,7 @@ struct timed_fence {
 void timed_fence_init(struct timed_fence *tf, unsigned long expires);
 void timed_fence_fini(struct timed_fence *tf);
 
+struct i915_sw_fence *heap_fence_create(gfp_t gfp);
+void heap_fence_put(struct i915_sw_fence *fence);
+
 #endif /* _LIB_SW_FENCE_H_ */
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index d937bdff26f9..e4db9a31b510 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -49,6 +49,8 @@ mock_context(struct drm_i915_private *i915,
 		struct intel_context *ce = &ctx->__engine[n];
 
 		ce->gem_context = ctx;
+		INIT_LIST_HEAD(&ce->signal_link);
+		INIT_LIST_HEAD(&ce->signals);
 	}
 
 	ret = i915_gem_context_pin_hw_id(ctx);
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index b4b61056b227..c1cd8b27b32a 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -75,25 +75,27 @@ static struct mock_request *first_request(struct mock_engine *engine)
 					link);
 }
 
-static void advance(struct mock_engine *engine,
-		    struct mock_request *request)
+static void advance(struct mock_request *request)
 {
 	list_del_init(&request->link);
 	i915_request_fake_complete(&request->base);
-	mock_seqno_advance(&engine->base, request->base.global_seqno);
+	intel_engine_write_global_seqno(request->base.engine,
+					request->base.global_seqno);
+	intel_engine_queue_breadcrumbs(request->base.engine);
 }
 
 static void hw_delay_complete(struct timer_list *t)
 {
 	struct mock_engine *engine = from_timer(engine, t, hw_delay);
 	struct mock_request *request;
+	unsigned long flags;
 
-	spin_lock(&engine->hw_lock);
+	spin_lock_irqsave(&engine->hw_lock, flags);
 
 	/* Timer fired, first request is complete */
 	request = first_request(engine);
 	if (request)
-		advance(engine, request);
+		advance(request);
 
 	/*
 	 * Also immediately signal any subsequent 0-delay requests, but
@@ -105,10 +107,10 @@ static void hw_delay_complete(struct timer_list *t)
 			break;
 		}
 
-		advance(engine, request);
+		advance(request);
 	}
 
-	spin_unlock(&engine->hw_lock);
+	spin_unlock_irqrestore(&engine->hw_lock, flags);
 }
 
 static void mock_context_unpin(struct intel_context *ce)
@@ -179,19 +181,20 @@ static void mock_submit_request(struct i915_request *request)
 	struct mock_request *mock = container_of(request, typeof(*mock), base);
 	struct mock_engine *engine =
 		container_of(request->engine, typeof(*engine), base);
+	unsigned long flags;
 
 	i915_request_submit(request);
 	GEM_BUG_ON(!request->global_seqno);
 
-	spin_lock_irq(&engine->hw_lock);
+	spin_lock_irqsave(&engine->hw_lock, flags);
 	list_add_tail(&mock->link, &engine->hw_queue);
 	if (mock->link.prev == &engine->hw_queue) {
 		if (mock->delay)
 			mod_timer(&engine->hw_delay, jiffies + mock->delay);
 		else
-			advance(engine, mock);
+			advance(mock);
 	}
-	spin_unlock_irq(&engine->hw_lock);
+	spin_unlock_irqrestore(&engine->hw_lock, flags);
 }
 
 struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
@@ -255,12 +258,13 @@ void mock_engine_flush(struct intel_engine_cs *engine)
 
 	spin_lock_irq(&mock->hw_lock);
 	list_for_each_entry_safe(request, rn, &mock->hw_queue, link)
-		advance(mock, request);
+		advance(request);
 	spin_unlock_irq(&mock->hw_lock);
 }
 
 void mock_engine_reset(struct intel_engine_cs *engine)
 {
+	intel_engine_write_global_seqno(engine, 0);
 }
 
 void mock_engine_free(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.h b/drivers/gpu/drm/i915/selftests/mock_engine.h
index 133d0c21790d..b9cc3a245f16 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.h
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.h
@@ -46,10 +46,4 @@ void mock_engine_flush(struct intel_engine_cs *engine);
 void mock_engine_reset(struct intel_engine_cs *engine);
 void mock_engine_free(struct intel_engine_cs *engine);
 
-static inline void mock_seqno_advance(struct intel_engine_cs *engine, u32 seqno)
-{
-	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
-	intel_engine_wakeup(engine);
-}
-
 #endif /* !__MOCK_ENGINE_H__ */
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 22/38] drm/i915: Drop fake breadcrumb irq
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (20 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 21/38] drm/i915: Replace global breadcrumbs with per-context interrupt tracking Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 23/38] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno Chris Wilson
                   ` (17 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

Missed breadcrumb detection is defunct due to the tight coupling with
dma_fence signaling and the myriad ways we may signal fences from
everywhere but an interrupt, i.e. we frequently signal a fence before
we even see its interrupt. This means that even if we miss an
interrupt for a fence, it is still signaled before our breadcrumb
hangcheck fires. Simplify the breadcrumb hangchecking by moving it
into the GPU hangcheck and forgo the fake interrupts.
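
As a rough sketch of the idea (illustrative only; the real change is a
two-line hunk in the existing hangcheck worker in intel_hangcheck.c,
and the wrapper name below is hypothetical), the periodic GPU
hangcheck becomes the backstop that flushes breadcrumbs, so a fence
whose seqno advanced without a user interrupt is still signaled on
the next hangcheck tick:

	static void hangcheck_flush_breadcrumbs(struct intel_engine_cs *engine)
	{
		/*
		 * Signal any completed fences whose user interrupt we may
		 * have missed; with the fake-irq timers gone, the periodic
		 * hangcheck is the only backstop left.
		 * intel_engine_queue_breadcrumbs() is the helper added
		 * earlier in the series; it queues the breadcrumbs irq_work.
		 */
		intel_engine_queue_breadcrumbs(engine);
	}

With that backstop in place, the fake_irq/hangcheck timers and the
missed_irq_rings machinery removed below have no remaining users.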

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |  93 -----------
 drivers/gpu/drm/i915/i915_gpu_error.c         |   2 -
 drivers/gpu/drm/i915/i915_gpu_error.h         |   5 -
 drivers/gpu/drm/i915/intel_breadcrumbs.c      | 147 +-----------------
 drivers/gpu/drm/i915/intel_hangcheck.c        |   2 +
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   5 -
 .../gpu/drm/i915/selftests/i915_gem_context.c |   7 -
 drivers/gpu/drm/i915/selftests/i915_request.c |   7 -
 8 files changed, 5 insertions(+), 263 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index d7764e62e9b4..c2aaf010c3d1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1321,9 +1321,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   intel_engine_last_submit(engine),
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
-		seq_printf(m, "\tfake irq active? %s\n",
-			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
@@ -3874,94 +3871,6 @@ DEFINE_SIMPLE_ATTRIBUTE(i915_wedged_fops,
 			i915_wedged_get, i915_wedged_set,
 			"%llu\n");
 
-static int
-fault_irq_set(struct drm_i915_private *i915,
-	      unsigned long *irq,
-	      unsigned long val)
-{
-	int err;
-
-	err = mutex_lock_interruptible(&i915->drm.struct_mutex);
-	if (err)
-		return err;
-
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED |
-				     I915_WAIT_INTERRUPTIBLE,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err)
-		goto err_unlock;
-
-	*irq = val;
-	mutex_unlock(&i915->drm.struct_mutex);
-
-	/* Flush idle worker to disarm irq */
-	drain_delayed_work(&i915->gt.idle_work);
-
-	return 0;
-
-err_unlock:
-	mutex_unlock(&i915->drm.struct_mutex);
-	return err;
-}
-
-static int
-i915_ring_missed_irq_get(void *data, u64 *val)
-{
-	struct drm_i915_private *dev_priv = data;
-
-	*val = dev_priv->gpu_error.missed_irq_rings;
-	return 0;
-}
-
-static int
-i915_ring_missed_irq_set(void *data, u64 val)
-{
-	struct drm_i915_private *i915 = data;
-
-	return fault_irq_set(i915, &i915->gpu_error.missed_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_missed_irq_fops,
-			i915_ring_missed_irq_get, i915_ring_missed_irq_set,
-			"0x%08llx\n");
-
-static int
-i915_ring_test_irq_get(void *data, u64 *val)
-{
-	struct drm_i915_private *dev_priv = data;
-
-	*val = dev_priv->gpu_error.test_irq_rings;
-
-	return 0;
-}
-
-static int
-i915_ring_test_irq_set(void *data, u64 val)
-{
-	struct drm_i915_private *i915 = data;
-
-	/* GuC keeps the user interrupt permanently enabled for submission */
-	if (USES_GUC_SUBMISSION(i915))
-		return -ENODEV;
-
-	/*
-	 * From icl, we can no longer individually mask interrupt generation
-	 * from each engine.
-	 */
-	if (INTEL_GEN(i915) >= 11)
-		return -ENODEV;
-
-	val &= INTEL_INFO(i915)->ring_mask;
-	DRM_DEBUG_DRIVER("Masking interrupts on rings 0x%08llx\n", val);
-
-	return fault_irq_set(i915, &i915->gpu_error.test_irq_rings, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(i915_ring_test_irq_fops,
-			i915_ring_test_irq_get, i915_ring_test_irq_set,
-			"0x%08llx\n");
-
 #define DROP_UNBOUND	BIT(0)
 #define DROP_BOUND	BIT(1)
 #define DROP_RETIRE	BIT(2)
@@ -4724,8 +4633,6 @@ static const struct i915_debugfs_files {
 } i915_debugfs_files[] = {
 	{"i915_wedged", &i915_wedged_fops},
 	{"i915_cache_sharing", &i915_cache_sharing_fops},
-	{"i915_ring_missed_irq", &i915_ring_missed_irq_fops},
-	{"i915_ring_test_irq", &i915_ring_test_irq_fops},
 	{"i915_gem_drop_caches", &i915_drop_caches_fops},
 #if IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR)
 	{"i915_error_state", &i915_error_state_fops},
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 7c2510ce81d2..7d26c25ee785 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -718,8 +718,6 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 	err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
 	err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
 	err_printf(m, "CCID: 0x%08x\n", error->ccid);
-	err_printf(m, "Missed interrupts: 0x%08lx\n",
-		   m->i915->gpu_error.missed_irq_rings);
 
 	for (i = 0; i < error->nfence; i++)
 		err_printf(m, "  fence[%d] = %08llx\n", i, error->fence[i]);
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 0e184712cbcc..99a53c0cd6da 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -203,8 +203,6 @@ struct i915_gpu_error {
 
 	atomic_t pending_fb_pin;
 
-	unsigned long missed_irq_rings;
-
 	/**
 	 * State variable controlling the reset flow and count
 	 *
@@ -273,9 +271,6 @@ struct i915_gpu_error {
 	 */
 	wait_queue_head_t reset_queue;
 
-	/* For missed irq/seqno simulation. */
-	unsigned long test_irq_rings;
-
 	struct i915_gpu_restart *restart;
 };
 
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index 1187d2edd330..6ae7f37cff35 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -86,7 +86,6 @@ bool intel_engine_breadcrumbs_irq(struct intel_engine_cs *engine)
 
 	spin_lock(&b->irq_lock);
 
-	b->irq_fired = true;
 	if (b->irq_armed && list_empty(&b->signalers))
 		__intel_breadcrumbs_disarm_irq(b);
 
@@ -150,86 +149,6 @@ static void signal_irq_work(struct irq_work *work)
 	intel_engine_breadcrumbs_irq(engine);
 }
 
-static unsigned long wait_timeout(void)
-{
-	return round_jiffies_up(jiffies + DRM_I915_HANGCHECK_JIFFIES);
-}
-
-static noinline void missed_breadcrumb(struct intel_engine_cs *engine)
-{
-	if (GEM_SHOW_DEBUG()) {
-		struct drm_printer p = drm_debug_printer(__func__);
-
-		intel_engine_dump(engine, &p,
-				  "%s missed breadcrumb at %pS\n",
-				  engine->name, __builtin_return_address(0));
-	}
-
-	set_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-}
-
-static void intel_breadcrumbs_hangcheck(struct timer_list *t)
-{
-	struct intel_engine_cs *engine =
-		from_timer(engine, t, breadcrumbs.hangcheck);
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	if (!b->irq_armed)
-		return;
-
-	if (b->irq_fired)
-		goto rearm;
-
-	/*
-	 * We keep the hangcheck timer alive until we disarm the irq, even
-	 * if there are no waiters at present.
-	 *
-	 * If the waiter was currently running, assume it hasn't had a chance
-	 * to process the pending interrupt (e.g, low priority task on a loaded
-	 * system) and wait until it sleeps before declaring a missed interrupt.
-	 *
-	 * If the waiter was asleep (and not even pending a wakeup), then we
-	 * must have missed an interrupt as the GPU has stopped advancing
-	 * but we still have a waiter. Assuming all batches complete within
-	 * DRM_I915_HANGCHECK_JIFFIES [1.5s]!
-	 */
-	synchronize_hardirq(engine->i915->drm.irq);
-	if (intel_engine_signal_breadcrumbs(engine)) {
-		missed_breadcrumb(engine);
-		mod_timer(&b->fake_irq, jiffies + 1);
-	} else {
-rearm:
-		b->irq_fired = false;
-		mod_timer(&b->hangcheck, wait_timeout());
-	}
-}
-
-static void intel_breadcrumbs_fake_irq(struct timer_list *t)
-{
-	struct intel_engine_cs *engine =
-		from_timer(engine, t, breadcrumbs.fake_irq);
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	/*
-	 * The timer persists in case we cannot enable interrupts,
-	 * or if we have previously seen seqno/interrupt incoherency
-	 * ("missed interrupt" syndrome, better known as a "missed breadcrumb").
-	 * Here the worker will wake up every jiffie in order to kick the
-	 * oldest waiter to do the coherent seqno check.
-	 */
-
-	if (!intel_engine_signal_breadcrumbs(engine) && !b->irq_armed)
-		return;
-
-	/* If the user has disabled the fake-irq, restore the hangchecking */
-	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings)) {
-		mod_timer(&b->hangcheck, wait_timeout());
-		return;
-	}
-
-	mod_timer(&b->fake_irq, jiffies + 1);
-}
-
 void intel_engine_pin_breadcrumbs_irq(struct intel_engine_cs *engine)
 {
 	struct intel_breadcrumbs *b = &engine->breadcrumbs;
@@ -252,43 +171,14 @@ void intel_engine_unpin_breadcrumbs_irq(struct intel_engine_cs *engine)
 	spin_unlock_irq(&b->irq_lock);
 }
 
-static bool use_fake_irq(const struct intel_breadcrumbs *b)
-{
-	const struct intel_engine_cs *engine =
-		container_of(b, struct intel_engine_cs, breadcrumbs);
-
-	if (!test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-		return false;
-
-	/*
-	 * Only start with the heavy weight fake irq timer if we have not
-	 * seen any interrupts since enabling it the first time. If the
-	 * interrupts are still arriving, it means we made a mistake in our
-	 * engine->seqno_barrier(), a timing error that should be transient
-	 * and unlikely to reoccur.
-	 */
-	return !b->irq_fired;
-}
-
-static void enable_fake_irq(struct intel_breadcrumbs *b)
-{
-	/* Ensure we never sleep indefinitely */
-	if (!b->irq_enabled || use_fake_irq(b))
-		mod_timer(&b->fake_irq, jiffies + 1);
-	else
-		mod_timer(&b->hangcheck, wait_timeout());
-}
-
-static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
+static void __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 {
 	struct intel_engine_cs *engine =
 		container_of(b, struct intel_engine_cs, breadcrumbs);
-	struct drm_i915_private *i915 = engine->i915;
-	bool enabled;
 
 	lockdep_assert_held(&b->irq_lock);
 	if (b->irq_armed)
-		return false;
+		return;
 
 	/*
 	 * The breadcrumb irq will be disarmed on the interrupt after the
@@ -306,16 +196,8 @@ static bool __intel_breadcrumbs_arm_irq(struct intel_breadcrumbs *b)
 	 * the driver is idle) we disarm the breadcrumbs.
 	 */
 
-	/* No interrupts? Kick the waiter every jiffie! */
-	enabled = false;
-	if (!b->irq_enabled++ &&
-	    !test_bit(engine->id, &i915->gpu_error.test_irq_rings)) {
+	if (!b->irq_enabled++)
 		irq_enable(engine);
-		enabled = true;
-	}
-
-	enable_fake_irq(b);
-	return enabled;
 }
 
 void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
@@ -326,18 +208,6 @@ void intel_engine_init_breadcrumbs(struct intel_engine_cs *engine)
 	INIT_LIST_HEAD(&b->signalers);
 
 	init_irq_work(&b->irq_work, signal_irq_work);
-
-	timer_setup(&b->fake_irq, intel_breadcrumbs_fake_irq, 0);
-	timer_setup(&b->hangcheck, intel_breadcrumbs_hangcheck, 0);
-}
-
-static void cancel_fake_irq(struct intel_engine_cs *engine)
-{
-	struct intel_breadcrumbs *b = &engine->breadcrumbs;
-
-	del_timer_sync(&b->fake_irq); /* may queue b->hangcheck */
-	del_timer_sync(&b->hangcheck);
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
 }
 
 void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
@@ -347,13 +217,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
 	spin_lock_irqsave(&b->irq_lock, flags);
 
-	/*
-	 * Leave the fake_irq timer enabled (if it is running), but clear the
-	 * bit so that it turns itself off on its next wake up and goes back
-	 * to the long hangcheck interval if still required.
-	 */
-	clear_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings);
-
 	if (b->irq_enabled)
 		irq_enable(engine);
 	else
@@ -364,7 +227,6 @@ void intel_engine_reset_breadcrumbs(struct intel_engine_cs *engine)
 
 void intel_engine_fini_breadcrumbs(struct intel_engine_cs *engine)
 {
-	cancel_fake_irq(engine);
 }
 
 bool intel_engine_enable_signaling(struct i915_request *rq)
@@ -446,7 +308,4 @@ void intel_engine_print_breadcrumbs(struct intel_engine_cs *engine,
 		}
 	}
 	spin_unlock_irq(&b->irq_lock);
-
-	if (test_bit(engine->id, &engine->i915->gpu_error.missed_irq_rings))
-		drm_printf(p, "Fake irq active\n");
 }
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 925146dcc0f8..e92f48a40a59 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -279,6 +279,8 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 	for_each_engine(engine, dev_priv, id) {
 		struct hangcheck hc;
 
+		intel_engine_signal_breadcrumbs(engine);
+
 		hangcheck_load_sample(engine, &hc);
 		hangcheck_accumulate_sample(engine, &hc);
 		hangcheck_store_sample(engine, &hc);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 6d1485208342..3ab159529dc7 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -390,14 +390,9 @@ struct intel_engine_cs {
 
 		struct irq_work irq_work;
 
-		struct timer_list fake_irq; /* used after a missed interrupt */
-		struct timer_list hangcheck; /* detect missed interrupts */
-
-		unsigned int hangcheck_interrupts;
 		unsigned int irq_enabled;
 
 		bool irq_armed;
-		bool irq_fired;
 	} breadcrumbs;
 
 	struct {
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 4cba50679607..2cca234fd291 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -65,7 +65,6 @@ static int begin_live_test(struct live_test *t,
 		return err;
 	}
 
-	i915->gpu_error.missed_irq_rings = 0;
 	t->reset_global = i915_reset_count(&i915->gpu_error);
 
 	for_each_engine(engine, i915, id)
@@ -103,12 +102,6 @@ static int end_live_test(struct live_test *t)
 		return -EIO;
 	}
 
-	if (i915->gpu_error.missed_irq_rings) {
-		pr_err("%s(%s): Missed interrupts on engines %lx\n",
-		       t->func, t->name, i915->gpu_error.missed_irq_rings);
-		return -EIO;
-	}
-
 	return 0;
 }
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 48b6159afe64..5953a47827a7 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -535,7 +535,6 @@ static int begin_live_test(struct live_test *t,
 		return err;
 	}
 
-	i915->gpu_error.missed_irq_rings = 0;
 	t->reset_count = i915_reset_count(&i915->gpu_error);
 
 	return 0;
@@ -559,12 +558,6 @@ static int end_live_test(struct live_test *t)
 		return -EIO;
 	}
 
-	if (i915->gpu_error.missed_irq_rings) {
-		pr_err("%s(%s): Missed interrupts on engines %lx\n",
-		       t->func, t->name, i915->gpu_error.missed_irq_rings);
-		return -EIO;
-	}
-
 	return 0;
 }
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 23/38] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (21 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 22/38] drm/i915: Drop fake breadcrumb irq Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 24/38] drm/i915: Avoid presumption of execution ordering for kernel context switching Chris Wilson
                   ` (16 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

To determine whether an engine is 'stuck', we simply check whether or
not it is still on the same seqno for several seconds. To keep this
simple mechanism intact over the loss of a global seqno, we can add a
new global heartbeat seqno instead. As we cannot know the sequence in
which requests will then be completed, we use a primitive random number
generator instead (with a cycle long enough to not matter over an
interval of a few thousand requests between hangcheck samples).
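
As a rough illustration only (a self-contained userspace sketch, not the
driver code itself; the LCG constants follow the kernel's
next_pseudo_random32() helper and every name below is made up for the
example), each breadcrumb writes a freshly advanced heartbeat value into
its own status-page dword, and hangcheck treats any change between two
samples as proof of progress:

  #include <stdint.h>
  #include <stdbool.h>

  /* Same linear congruential step as next_pseudo_random32(); the cycle is
   * far longer than the few thousand requests emitted between samples. */
  static uint32_t next_heartbeat(uint32_t seed)
  {
      return seed * 1664525u + 1013904223u;
  }

  struct hangcheck_sample {
      uint32_t last_seqno; /* heartbeat value seen on the previous check */
  };

  /* Emit path: every breadcrumb advances the engine's heartbeat and writes
   * the new value into a dedicated status-page dword. */
  static uint32_t emit_heartbeat(uint32_t *next_seqno)
  {
      *next_seqno = next_heartbeat(*next_seqno);
      return *next_seqno;
  }

  /* Hangcheck path: any change since the previous sample means at least
   * one breadcrumb completed, i.e. the engine is making progress. */
  static bool engine_made_progress(struct hangcheck_sample *hc,
                                   uint32_t sampled)
  {
      bool progress = sampled != hc->last_seqno;

      hc->last_seqno = sampled;
      return progress;
  }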

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |  7 ++++---
 drivers/gpu/drm/i915/intel_engine_cs.c  |  5 +++--
 drivers/gpu/drm/i915/intel_hangcheck.c  |  6 +++---
 drivers/gpu/drm/i915/intel_lrc.c        | 19 +++++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.c | 28 +++++++++++++++++++------
 drivers/gpu/drm/i915/intel_ringbuffer.h | 19 ++++++++++++++++-
 6 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index c2aaf010c3d1..16a9384de478 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1297,7 +1297,7 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	with_intel_runtime_pm(dev_priv, wakeref) {
 		for_each_engine(engine, dev_priv, id) {
 			acthd[id] = intel_engine_get_active_head(engine);
-			seqno[id] = intel_engine_get_seqno(engine);
+			seqno[id] = intel_engine_get_hangcheck_seqno(engine);
 		}
 
 		intel_engine_get_instdone(dev_priv->engine[RCS], &instdone);
@@ -1317,8 +1317,9 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	for_each_engine(engine, dev_priv, id) {
 		seq_printf(m, "%s:\n", engine->name);
 		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
-			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine),
+			   engine->hangcheck.last_seqno,
+			   seqno[id],
+			   engine->hangcheck.next_seqno,
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 429f21e16428..81349258c4d6 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -1438,10 +1438,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	if (i915_terminally_wedged(&engine->i915->gpu_error))
 		drm_printf(m, "*** WEDGED ***\n");
 
-	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
+	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
 		   intel_engine_get_seqno(engine),
 		   intel_engine_last_submit(engine),
-		   engine->hangcheck.seqno,
+		   engine->hangcheck.last_seqno,
+		   engine->hangcheck.next_seqno,
 		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 	drm_printf(m, "\tReset count: %d (global %d)\n",
 		   i915_reset_engine_count(error, engine),
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index e92f48a40a59..2d0ef22518b1 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -133,21 +133,21 @@ static void hangcheck_load_sample(struct intel_engine_cs *engine,
 				  struct hangcheck *hc)
 {
 	hc->acthd = intel_engine_get_active_head(engine);
-	hc->seqno = intel_engine_get_seqno(engine);
+	hc->seqno = intel_engine_get_hangcheck_seqno(engine);
 }
 
 static void hangcheck_store_sample(struct intel_engine_cs *engine,
 				   const struct hangcheck *hc)
 {
 	engine->hangcheck.acthd = hc->acthd;
-	engine->hangcheck.seqno = hc->seqno;
+	engine->hangcheck.last_seqno = hc->seqno;
 }
 
 static enum intel_engine_hangcheck_action
 hangcheck_get_action(struct intel_engine_cs *engine,
 		     const struct hangcheck *hc)
 {
-	if (engine->hangcheck.seqno != hc->seqno)
+	if (engine->hangcheck.last_seqno != hc->seqno)
 		return ENGINE_ACTIVE_SEQNO;
 
 	if (intel_engine_is_idle(engine))
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 740e09d3ca26..541e0e9ee781 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -177,6 +177,12 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_INDEX_ADDR;
 }
 
+static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
+{
+	return (i915_ggtt_offset(engine->status_page.vma) +
+		I915_GEM_HWS_HANGCHECK_ADDR);
+}
+
 static inline struct i915_priolist *to_priolist(struct rb_node *rb)
 {
 	return rb_entry(rb, struct i915_priolist, node);
@@ -2062,6 +2068,10 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 				  request->fence.seqno,
 				  i915_timeline_seqno_address(request->timeline));
 
+	cs = gen8_emit_ggtt_write(cs,
+				  intel_engine_next_hangcheck_seqno(request->engine),
+				  intel_hws_hangcheck_address(request->engine));
+
 	cs = gen8_emit_ggtt_write(cs,
 				  request->global_seqno,
 				  intel_hws_seqno_address(request->engine));
@@ -2074,7 +2084,7 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 14 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
@@ -2087,6 +2097,11 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 				      PIPE_CONTROL_FLUSH_ENABLE |
 				      PIPE_CONTROL_CS_STALL);
 
+	cs = gen8_emit_ggtt_write_rcs(cs,
+				      intel_engine_next_hangcheck_seqno(request->engine),
+				      intel_hws_hangcheck_address(request->engine),
+				      PIPE_CONTROL_CS_STALL);
+
 	cs = gen8_emit_ggtt_write_rcs(cs,
 				      request->global_seqno,
 				      intel_hws_seqno_address(request->engine),
@@ -2100,7 +2115,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_rcs_sz = 20 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct i915_request *rq)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 97732e387821..0bc513c1db33 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -455,17 +455,20 @@ static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen6_xcs_emit_breadcrumb_sz = 8;
+static const int gen6_xcs_emit_breadcrumb_sz = 10;
 
 #define GEN7_XCS_WA 32
 static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
@@ -476,6 +479,10 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = rq->global_seqno;
@@ -491,11 +498,12 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = 0;
 
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3;
+static const int gen7_xcs_emit_breadcrumb_sz = 14 + GEN7_XCS_WA * 3;
 #undef GEN7_XCS_WA
 
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
@@ -932,16 +940,21 @@ static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	*cs++ = MI_STORE_DWORD_INDEX;
 	*cs++ = I915_GEM_HWS_INDEX_ADDR;
 	*cs++ = rq->global_seqno;
 
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int i9xx_emit_breadcrumb_sz = 8;
+static const int i9xx_emit_breadcrumb_sz = 12;
 
 #define GEN5_WA_STORES 8 /* must be at least 1! */
 static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
@@ -956,6 +969,10 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_SEQNO_ADDR;
 	*cs++ = rq->fence.seqno;
 
+	*cs++ = MI_STORE_DWORD_INDEX;
+	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
+	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
+
 	BUILD_BUG_ON(GEN5_WA_STORES < 1);
 	for (i = 0; i < GEN5_WA_STORES; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
@@ -964,12 +981,11 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	}
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 6;
+static const int gen5_emit_breadcrumb_sz = GEN5_WA_STORES * 3 + 8;
 #undef GEN5_WA_STORES
 
 static void
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 3ab159529dc7..126c5285fd01 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -6,6 +6,7 @@
 
 #include <linux/hashtable.h>
 #include <linux/irq_work.h>
+#include <linux/random.h>
 #include <linux/seqlock.h>
 
 #include "i915_gem_batch_pool.h"
@@ -119,7 +120,8 @@ struct intel_instdone {
 
 struct intel_engine_hangcheck {
 	u64 acthd;
-	u32 seqno;
+	u32 last_seqno;
+	u32 next_seqno;
 	unsigned long action_timestamp;
 	struct intel_instdone instdone;
 };
@@ -713,6 +715,8 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 #define I915_GEM_HWS_PREEMPT_INDEX	0x32
 #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
+#define I915_GEM_HWS_HANGCHECK		0x34
+#define I915_GEM_HWS_HANGCHECK_ADDR (I915_GEM_HWS_HANGCHECK << MI_STORE_DWORD_INDEX_SHIFT)
 #define I915_GEM_HWS_SEQNO		0x40
 #define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO << MI_STORE_DWORD_INDEX_SHIFT)
 #define I915_GEM_HWS_SCRATCH_INDEX	0x80
@@ -1064,4 +1068,17 @@ static inline bool inject_preempt_hang(struct intel_engine_execlists *execlists)
 
 #endif
 
+static inline u32 intel_engine_next_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+	engine->hangcheck.next_seqno =
+		next_pseudo_random32(engine->hangcheck.next_seqno);
+
+	return engine->hangcheck.next_seqno;
+}
+
+static inline u32 intel_engine_get_hangcheck_seqno(struct intel_engine_cs *engine)
+{
+	return intel_read_status_page(engine, I915_GEM_HWS_HANGCHECK);
+}
+
 #endif /* _INTEL_RINGBUFFER_H_ */
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 24/38] drm/i915: Avoid presumption of execution ordering for kernel context switching
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (22 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 23/38] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer Chris Wilson
                   ` (15 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

For future GuC implementations, the execution order of individual
requests will be opaque. As such we will not have a single execution
timeline and will not know the last request/context to be run on each
engine. The major consequence of this is that we do not know which
context is still volatile on the HW and which has been saved and can be
swapped out. The only point at which we can know is after a synchronous
switch to the kernel context, which we perform on idling. So we must keep
each context pinned from its first use until the next time we idle the
GPU. One consequence of this is that context eviction requires switching
to the kernel context and idling!
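
A minimal sketch of the resulting pin-until-park lifecycle (a userspace
illustration with made-up names, and a plain singly-linked list standing
in for the driver's active-context tracking): a context gains one extra
pin the first time it is used in a busy period, and all of those pins are
dropped only once the GPU has switched to the kernel context and parked:

  #include <stdbool.h>
  #include <stddef.h>

  struct ctx {
      int pin_count;          /* backing storage stays resident while > 0 */
      bool on_active_list;
      struct ctx *next_active;
  };

  static struct ctx *active_contexts;

  /* Request submission: pin the context for the whole busy period the
   * first time it is seen, since we no longer know when the HW saves
   * its state back to memory. */
  static void pin_active_context(struct ctx *ce)
  {
      if (!ce->on_active_list) {
          ce->pin_count++;
          ce->on_active_list = true;
          ce->next_active = active_contexts;
          active_contexts = ce;
      }
  }

  /* Parking: after the synchronous switch to the kernel context and the
   * wait for idle, nothing volatile remains on the HW, so release every
   * context and allow its storage to be evicted again. */
  static void contexts_park(void)
  {
      struct ctx *ce = active_contexts;

      while (ce) {
          struct ctx *next = ce->next_active;

          ce->pin_count--;
          ce->on_active_list = false;
          ce->next_active = NULL;
          ce = next;
      }
      active_contexts = NULL;
  }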

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h               |   4 +
 drivers/gpu/drm/i915/i915_gem.c               | 151 +++++++-----------
 drivers/gpu/drm/i915/i915_gem_context.c       |  93 +++--------
 drivers/gpu/drm/i915/i915_gem_context.h       |  10 +-
 drivers/gpu/drm/i915/i915_gem_evict.c         |  20 +--
 drivers/gpu/drm/i915/i915_request.c           |  41 ++---
 drivers/gpu/drm/i915/i915_request.h           |   1 +
 drivers/gpu/drm/i915/intel_engine_cs.c        |  60 +------
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  14 --
 drivers/gpu/drm/i915/selftests/i915_gem.c     |   4 +-
 .../gpu/drm/i915/selftests/i915_gem_context.c |  60 +++----
 .../gpu/drm/i915/selftests/i915_gem_evict.c   | 148 -----------------
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   2 +-
 drivers/gpu/drm/i915/selftests/intel_lrc.c    |   8 +-
 .../drm/i915/selftests/intel_workarounds.c    |   2 +-
 drivers/gpu/drm/i915/selftests/mock_context.c |   1 +
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  11 +-
 .../gpu/drm/i915/selftests/mock_gem_device.c  |  15 +-
 18 files changed, 177 insertions(+), 468 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cd2ea6e13fa6..74bccb153359 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1984,8 +1984,10 @@ struct drm_i915_private {
 			struct list_head hwsp_free_list;
 		} timelines;
 
+		struct list_head active_contexts;
 		struct list_head active_rings;
 		struct list_head closed_vma;
+		unsigned long active_engines;
 		u32 active_requests;
 
 		/**
@@ -2095,6 +2097,8 @@ static inline struct drm_i915_private *huc_to_i915(struct intel_huc *huc)
 	     (tmp__) ? \
 	     ((engine__) = (dev_priv__)->engine[__mask_next_bit(tmp__)]), 1 : \
 	     0;)
+#define for_each_active_engine(engine__, i915__, tmp__) \
+	for_each_engine_masked((engine__), (i915__), (i915__)->gt.active_engines, (tmp__))
 
 enum hdmi_force_audio {
 	HDMI_AUDIO_OFF_DVI = -2,	/* no aux data for HDMI-DVI converter */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9d6edb0c8a75..96b33f8ba9a9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -168,11 +168,19 @@ static u32 __i915_gem_park(struct drm_i915_private *i915)
 	synchronize_irq(i915->drm.irq);
 
 	intel_engines_park(i915);
+	intel_contexts_park(i915);
 	i915_timelines_park(i915);
 
 	i915_pmu_gt_parked(i915);
 	i915_vma_parked(i915);
 
+	/*
+	 * A crude hack for mock selftesting to not release its fake powerwell.
+	 * A better hack would be to give it a real fake powerwell.
+	 */
+	if (I915_SELFTEST_ONLY(i915->gt.awake == -1 && i915->gt.epoch == -1))
+		return i915->gt.epoch;
+
 	wakeref = fetch_and_zero(&i915->gt.awake);
 	GEM_BUG_ON(!wakeref);
 
@@ -2986,43 +2994,23 @@ static void __sleep_rcu(struct rcu_head *rcu)
 	}
 }
 
-static inline bool
-new_requests_since_last_retire(const struct drm_i915_private *i915)
-{
-	return (READ_ONCE(i915->gt.active_requests) ||
-		work_pending(&i915->gt.idle_work.work));
-}
-
-static void assert_kernel_context_is_current(struct drm_i915_private *i915)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (i915_terminally_wedged(&i915->gpu_error))
-		return;
-
-	GEM_BUG_ON(i915->gt.active_requests);
-	for_each_engine(engine, i915, id) {
-		GEM_BUG_ON(__i915_gem_active_peek(&engine->timeline.last_request));
-		GEM_BUG_ON(engine->last_retired_context !=
-			   to_intel_context(i915->kernel_context, engine));
-	}
-}
-
 static void
 i915_gem_idle_work_handler(struct work_struct *work)
 {
-	struct drm_i915_private *dev_priv =
-		container_of(work, typeof(*dev_priv), gt.idle_work.work);
+	struct drm_i915_private *i915 =
+		container_of(work, typeof(*i915), gt.idle_work.work);
 	unsigned int epoch = I915_EPOCH_INVALID;
 	bool rearm_hangcheck;
 
-	if (!READ_ONCE(dev_priv->gt.awake))
+	if (!READ_ONCE(i915->gt.awake))
 		return;
 
-	if (READ_ONCE(dev_priv->gt.active_requests))
+	if (READ_ONCE(i915->gt.active_requests))
 		return;
 
+	rearm_hangcheck =
+		cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
+
 	/*
 	 * Flush out the last user context, leaving only the pinned
 	 * kernel context resident. When we are idling on the kernel_context,
@@ -3031,56 +3019,41 @@ i915_gem_idle_work_handler(struct work_struct *work)
 	 * always called at least twice before idling (and if the system is
 	 * idle that implies a round trip through the retire worker).
 	 */
-	mutex_lock(&dev_priv->drm.struct_mutex);
-	i915_gem_switch_to_kernel_context(dev_priv);
-	mutex_unlock(&dev_priv->drm.struct_mutex);
-
-	GEM_TRACE("active_requests=%d (after switch-to-kernel-context)\n",
-		  READ_ONCE(dev_priv->gt.active_requests));
-
-	/*
-	 * Wait for last execlists context complete, but bail out in case a
-	 * new request is submitted. As we don't trust the hardware, we
-	 * continue on if the wait times out. This is necessary to allow
-	 * the machine to suspend even if the hardware dies, and we will
-	 * try to recover in resume (after depriving the hardware of power,
-	 * it may be in a better mmod).
-	 */
-	__wait_for(if (new_requests_since_last_retire(dev_priv)) return,
-		   intel_engines_are_idle(dev_priv),
-		   I915_IDLE_ENGINES_TIMEOUT * 1000,
-		   10, 500);
-
-	rearm_hangcheck =
-		cancel_delayed_work_sync(&dev_priv->gpu_error.hangcheck_work);
-
-	if (!mutex_trylock(&dev_priv->drm.struct_mutex)) {
+	if (!mutex_trylock(&i915->drm.struct_mutex)) {
 		/* Currently busy, come back later */
-		mod_delayed_work(dev_priv->wq,
-				 &dev_priv->gt.idle_work,
+		mod_delayed_work(i915->wq,
+				 &i915->gt.idle_work,
 				 msecs_to_jiffies(50));
 		goto out_rearm;
 	}
 
-	/*
-	 * New request retired after this work handler started, extend active
-	 * period until next instance of the work.
-	 */
-	if (new_requests_since_last_retire(dev_priv))
-		goto out_unlock;
+	if (!i915->gt.active_requests &&
+	    !work_pending(&i915->gt.idle_work.work)) {
+		++i915->gt.active_requests; /* don't requeue idle! */
 
-	epoch = __i915_gem_park(dev_priv);
+		if (i915_gem_switch_to_kernel_context(i915) ||
+		    i915_gem_wait_for_idle(i915,
+					   I915_WAIT_LOCKED,
+					   HZ / 10)) {
+			dev_err(i915->drm.dev,
+				"Failed to idle engines, declaring wedged!\n");
+			GEM_TRACE_DUMP();
+			i915_gem_set_wedged(i915);
+			i915_retire_requests(i915);
+		}
 
-	assert_kernel_context_is_current(dev_priv);
+		if (!--i915->gt.active_requests) {
+			epoch = __i915_gem_park(i915);
+			rearm_hangcheck = false;
+		}
+	}
 
-	rearm_hangcheck = false;
-out_unlock:
-	mutex_unlock(&dev_priv->drm.struct_mutex);
+	mutex_unlock(&i915->drm.struct_mutex);
 
 out_rearm:
 	if (rearm_hangcheck) {
-		GEM_BUG_ON(!dev_priv->gt.awake);
-		i915_queue_hangcheck(dev_priv);
+		GEM_BUG_ON(!i915->gt.awake);
+		i915_queue_hangcheck(i915);
 	}
 
 	/*
@@ -3091,11 +3064,11 @@ i915_gem_idle_work_handler(struct work_struct *work)
 	 * period, and then queue a task (that will run last on the wq) to
 	 * shrink and re-optimize the caches.
 	 */
-	if (same_epoch(dev_priv, epoch)) {
+	if (same_epoch(i915, epoch)) {
 		struct sleep_rcu_work *s = kmalloc(sizeof(*s), GFP_KERNEL);
 		if (s) {
 			init_rcu_head(&s->rcu);
-			s->i915 = dev_priv;
+			s->i915 = i915;
 			s->epoch = epoch;
 			call_rcu(&s->rcu, __sleep_rcu);
 		}
@@ -3244,8 +3217,7 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
 		  timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
 
-	/* If the device is asleep, we have no requests outstanding */
-	if (!READ_ONCE(i915->gt.awake))
+	if (!READ_ONCE(i915->gt.active_requests))
 		return 0;
 
 	mutex_lock(&i915->gt.timelines.mutex);
@@ -3298,7 +3270,11 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
 			return err;
 
 		i915_retire_requests(i915);
-		GEM_BUG_ON(i915->gt.active_requests);
+		if (flags & I915_WAIT_FOR_IDLE_PARK) {
+			GEM_BUG_ON(i915->gt.active_requests);
+			__i915_gem_park(i915);
+			GEM_BUG_ON(i915->gt.active_engines);
+		}
 	}
 
 	return 0;
@@ -4504,10 +4480,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915, wakeref);
-
-	mutex_lock(&i915->drm.struct_mutex);
-	i915_gem_contexts_lost(i915);
-	mutex_unlock(&i915->drm.struct_mutex);
 }
 
 int i915_gem_suspend(struct drm_i915_private *i915)
@@ -4541,12 +4513,11 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 		ret = i915_gem_wait_for_idle(i915,
 					     I915_WAIT_INTERRUPTIBLE |
 					     I915_WAIT_LOCKED |
-					     I915_WAIT_FOR_IDLE_BOOST,
+					     I915_WAIT_FOR_IDLE_BOOST |
+					     I915_WAIT_FOR_IDLE_PARK,
 					     MAX_SCHEDULE_TIMEOUT);
 		if (ret && ret != -EIO)
 			goto err_unlock;
-
-		assert_kernel_context_is_current(i915);
 	}
 	i915_retire_requests(i915); /* ensure we flush after wedging */
 
@@ -4826,6 +4797,9 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	for_each_engine(engine, i915, id) {
 		struct i915_request *rq;
 
+		if (!engine->context_size)
+			continue;
+
 		rq = i915_request_alloc(engine, ctx);
 		if (IS_ERR(rq)) {
 			err = PTR_ERR(rq);
@@ -4845,21 +4819,19 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 	if (err)
 		goto err_active;
 
-	if (i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, HZ / 5)) {
-		i915_gem_set_wedged(i915);
-		err = -EIO; /* Caller will declare us wedged */
-		goto err_active;
-	}
-
-	assert_kernel_context_is_current(i915);
-
 	/*
 	 * Immediately park the GPU so that we enable powersaving and
 	 * treat it as idle. The next time we issue a request, we will
 	 * unpark and start using the engine->pinned_default_state, otherwise
 	 * it is in limbo and an early reset may fail.
 	 */
-	__i915_gem_park(i915);
+	if (i915_gem_wait_for_idle(i915,
+				   I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK,
+				   HZ / 5)) {
+		i915_gem_set_wedged(i915);
+		err = -EIO; /* Caller will declare us wedged */
+		goto err_active;
+	}
 
 	for_each_engine(engine, i915, id) {
 		struct i915_vma *state;
@@ -4934,11 +4906,11 @@ static int __intel_engines_record_defaults(struct drm_i915_private *i915)
 		goto out_ctx;
 
 	if (WARN_ON(i915_gem_wait_for_idle(i915,
-					   I915_WAIT_LOCKED,
+					   I915_WAIT_LOCKED |
+					   I915_WAIT_FOR_IDLE_PARK,
 					   MAX_SCHEDULE_TIMEOUT)))
 		goto out_ctx;
 
-	i915_gem_contexts_lost(i915);
 	goto out_ctx;
 }
 
@@ -5280,6 +5252,7 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
 	if (!dev_priv->priorities)
 		goto err_dependencies;
 
+	INIT_LIST_HEAD(&dev_priv->gt.active_contexts);
 	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
 	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 054d3e1bfe00..ecba3ee204a6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -342,6 +342,7 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 		struct intel_context *ce = &ctx->__engine[n];
 
 		ce->gem_context = ctx;
+		INIT_LIST_HEAD(&ce->active_link);
 		INIT_LIST_HEAD(&ce->signal_link);
 		INIT_LIST_HEAD(&ce->signals);
 	}
@@ -576,17 +577,6 @@ int i915_gem_contexts_init(struct drm_i915_private *dev_priv)
 	return 0;
 }
 
-void i915_gem_contexts_lost(struct drm_i915_private *dev_priv)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	lockdep_assert_held(&dev_priv->drm.struct_mutex);
-
-	for_each_engine(engine, dev_priv, id)
-		intel_engine_lost_context(engine);
-}
-
 void i915_gem_contexts_fini(struct drm_i915_private *i915)
 {
 	lockdep_assert_held(&i915->drm.struct_mutex);
@@ -660,80 +650,28 @@ last_request_on_engine(struct i915_timeline *timeline,
 	return NULL;
 }
 
-static bool engine_has_kernel_context_barrier(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *i915 = engine->i915;
-	const struct intel_context * const ce =
-		to_intel_context(i915->kernel_context, engine);
-	struct i915_timeline *barrier = ce->ring->timeline;
-	struct intel_ring *ring;
-	bool any_active = false;
-
-	lockdep_assert_held(&i915->drm.struct_mutex);
-	list_for_each_entry(ring, &i915->gt.active_rings, active_link) {
-		struct i915_request *rq;
-
-		rq = last_request_on_engine(ring->timeline, engine);
-		if (!rq)
-			continue;
-
-		any_active = true;
-
-		if (rq->hw_context == ce)
-			continue;
-
-		/*
-		 * Was this request submitted after the previous
-		 * switch-to-kernel-context?
-		 */
-		if (!i915_timeline_sync_is_later(barrier, &rq->fence)) {
-			GEM_TRACE("%s needs barrier for %llx:%lld\n",
-				  ring->timeline->name,
-				  rq->fence.context,
-				  rq->fence.seqno);
-			return false;
-		}
-
-		GEM_TRACE("%s has barrier after %llx:%lld\n",
-			  ring->timeline->name,
-			  rq->fence.context,
-			  rq->fence.seqno);
-	}
-
-	/*
-	 * If any other timeline was still active and behind the last barrier,
-	 * then our last switch-to-kernel-context must still be queued and
-	 * will run last (leaving the engine in the kernel context when it
-	 * eventually idles).
-	 */
-	if (any_active)
-		return true;
-
-	/* The engine is idle; check that it is idling in the kernel context. */
-	return engine->last_retired_context == ce;
-}
-
 int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
+	unsigned long tmp;
 
 	GEM_TRACE("awake?=%s\n", yesno(i915->gt.awake));
 
 	lockdep_assert_held(&i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915->kernel_context);
 
-	i915_retire_requests(i915);
+	/* Inoperable, so presume the GPU is safely pointing into the void! */
+	if (i915_terminally_wedged(&i915->gpu_error))
+		return 0;
 
-	for_each_engine(engine, i915, id) {
+	for_each_active_engine(engine, i915, tmp) {
 		struct intel_ring *ring;
 		struct i915_request *rq;
 
-		GEM_BUG_ON(!to_intel_context(i915->kernel_context, engine));
-		if (engine_has_kernel_context_barrier(engine))
+		if (!engine->context_size)
 			continue;
 
-		GEM_TRACE("emit barrier on %s\n", engine->name);
+		GEM_BUG_ON(!to_intel_context(i915->kernel_context, engine));
 
 		rq = i915_request_alloc(engine, i915->kernel_context);
 		if (IS_ERR(rq))
@@ -757,7 +695,6 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 			i915_sw_fence_await_sw_fence_gfp(&rq->submit,
 							 &prev->submit,
 							 I915_FENCE_GFP);
-			i915_timeline_sync_set(rq->timeline, &prev->fence);
 		}
 
 		i915_request_add(rq);
@@ -1024,6 +961,20 @@ int __i915_gem_context_pin_hw_id(struct i915_gem_context *ctx)
 	return err;
 }
 
+void intel_contexts_park(struct drm_i915_private *i915)
+{
+	struct intel_context *ce, *cn;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	list_for_each_entry_safe(ce, cn,
+				 &i915->gt.active_contexts, active_link) {
+		INIT_LIST_HEAD(&ce->active_link);
+		intel_context_unpin(ce);
+	}
+	INIT_LIST_HEAD(&i915->gt.active_contexts);
+}
+
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
 #include "selftests/mock_context.c"
 #include "selftests/i915_gem_context.c"
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index e5eca29cd373..1e41a97b8007 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -164,6 +164,7 @@ struct i915_gem_context {
 	struct intel_context {
 		struct i915_gem_context *gem_context;
 		struct intel_engine_cs *active;
+		struct list_head active_link;
 		struct list_head signal_link;
 		struct list_head signals;
 		struct i915_vma *state;
@@ -314,6 +315,12 @@ static inline void __intel_context_pin(struct intel_context *ce)
 	ce->pin_count++;
 }
 
+static inline void __intel_context_unpin(struct intel_context *ce)
+{
+	GEM_BUG_ON(!ce->pin_count);
+	ce->pin_count--;
+}
+
 static inline void intel_context_unpin(struct intel_context *ce)
 {
 	GEM_BUG_ON(!ce->pin_count);
@@ -324,9 +331,10 @@ static inline void intel_context_unpin(struct intel_context *ce)
 	ce->ops->unpin(ce);
 }
 
+void intel_contexts_park(struct drm_i915_private *i915);
+
 /* i915_gem_context.c */
 int __must_check i915_gem_contexts_init(struct drm_i915_private *dev_priv);
-void i915_gem_contexts_lost(struct drm_i915_private *dev_priv);
 void i915_gem_contexts_fini(struct drm_i915_private *dev_priv);
 
 int i915_gem_context_open(struct drm_i915_private *i915,
diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
index dc137701acb8..d4a7d460f8e3 100644
--- a/drivers/gpu/drm/i915/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/i915_gem_evict.c
@@ -32,24 +32,9 @@
 #include "intel_drv.h"
 #include "i915_trace.h"
 
-I915_SELFTEST_DECLARE(static struct igt_evict_ctl {
-	bool fail_if_busy:1;
-} igt_evict_ctl;)
-
 static bool ggtt_is_idle(struct drm_i915_private *i915)
 {
-       struct intel_engine_cs *engine;
-       enum intel_engine_id id;
-
-       if (i915->gt.active_requests)
-	       return false;
-
-       for_each_engine(engine, i915, id) {
-	       if (!intel_engine_has_kernel_context(engine))
-		       return false;
-       }
-
-       return true;
+	return !i915->gt.active_requests;
 }
 
 static int ggtt_flush(struct drm_i915_private *i915)
@@ -235,9 +220,6 @@ i915_gem_evict_something(struct i915_address_space *vm,
 	 * the kernel's there is no more we can evict.
 	 */
 	if (!ggtt_is_idle(dev_priv)) {
-		if (I915_SELFTEST_ONLY(igt_evict_ctl.fail_if_busy))
-			return -EBUSY;
-
 		ret = ggtt_flush(dev_priv);
 		if (ret)
 			return ret;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 5ad14409b52d..6ff0c47c3a0d 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -210,18 +210,6 @@ static void __retire_engine_request(struct intel_engine_cs *engine,
 	spin_unlock(&rq->lock);
 
 	local_irq_enable();
-
-	/*
-	 * The backing object for the context is done after switching to the
-	 * *next* context. Therefore we cannot retire the previous context until
-	 * the next context has already started running. However, since we
-	 * cannot take the required locks at i915_request_submit() we
-	 * defer the unpinning of the active context to now, retirement of
-	 * the subsequent request.
-	 */
-	if (engine->last_retired_context)
-		intel_context_unpin(engine->last_retired_context);
-	engine->last_retired_context = rq->hw_context;
 }
 
 static void __retire_engine_upto(struct intel_engine_cs *engine,
@@ -294,7 +282,6 @@ static void i915_request_retire(struct i915_request *request)
 
 	/* Retirement decays the ban score as it is a sign of ctx progress */
 	atomic_dec_if_positive(&request->gem_context->ban_score);
-	intel_context_unpin(request->hw_context);
 
 	__retire_engine_upto(request->engine, request);
 
@@ -556,8 +543,15 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	 * ourselves.
 	 */
 	ce = intel_context_pin(ctx, engine);
-	if (IS_ERR(ce))
-		return ERR_CAST(ce);
+	if (IS_ERR(ce)) {
+		i915_gem_wait_for_idle(i915,
+				       I915_WAIT_LOCKED |
+				       I915_WAIT_FOR_IDLE_PARK,
+				       MAX_SCHEDULE_TIMEOUT);
+		ce = intel_context_pin(ctx, engine);
+		if (IS_ERR(ce))
+			return ERR_CAST(ce);
+	}
 
 	reserve_gt(i915);
 
@@ -663,9 +657,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	if (ret)
 		goto err_unwind;
 
-	/* Keep a second pin for the dual retirement along engine and ring */
-	__intel_context_pin(ce);
-
 	rq->infix = rq->ring->emit; /* end of header; start of user payload */
 
 	/* Check that we didn't interrupt ourselves with a new request */
@@ -860,6 +851,17 @@ void i915_request_skip(struct i915_request *rq, int error)
 	memset(vaddr + head, 0, rq->postfix - head);
 }
 
+static void pin_active_context(struct drm_i915_private *i915,
+			       struct intel_context *ce)
+{
+	if (unlikely(list_empty(&ce->active_link))) {
+		list_add(&ce->active_link, &i915->gt.active_contexts);
+		__intel_context_pin(ce);
+	}
+	__intel_context_unpin(ce);
+	GEM_BUG_ON(!ce->pin_count);
+}
+
 /*
  * NB: This function is not allowed to fail. Doing so would mean the the
  * request is not being tracked for completion but the work itself is
@@ -935,6 +937,9 @@ void i915_request_add(struct i915_request *request)
 		GEM_TRACE("marking %s as active\n", ring->timeline->name);
 		list_add(&ring->active_link, &request->i915->gt.active_rings);
 	}
+	pin_active_context(request->i915, request->hw_context);
+	request->i915->gt.active_engines |= BIT(engine->id);
+
 	request->emitted_jiffies = jiffies;
 
 	/*
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index b6d473923506..dd413d51cc61 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -278,6 +278,7 @@ long i915_request_wait(struct i915_request *rq,
 #define I915_WAIT_PRIORITY	BIT(2) /* small priority bump for the request */
 #define I915_WAIT_ALL		BIT(3) /* used by i915_gem_object_wait() */
 #define I915_WAIT_FOR_IDLE_BOOST BIT(4)
+#define I915_WAIT_FOR_IDLE_PARK BIT(5)
 
 /**
  * Returns true if seq1 is later than seq2.
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 81349258c4d6..b7f129504014 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -613,8 +613,8 @@ int intel_engine_setup_common(struct intel_engine_cs *engine)
 	return err;
 }
 
-static void __intel_context_unpin(struct i915_gem_context *ctx,
-				  struct intel_engine_cs *engine)
+static void context_unpin(struct i915_gem_context *ctx,
+			  struct intel_engine_cs *engine)
 {
 	intel_context_unpin(to_intel_context(ctx, engine));
 }
@@ -666,7 +666,7 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
 	return 0;
 
 err_unpin_kernel:
-	__intel_context_unpin(i915->kernel_context, engine);
+	context_unpin(i915->kernel_context, engine);
 	return ret;
 }
 
@@ -691,8 +691,8 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 		i915_gem_object_put(engine->default_state);
 
 	if (i915->preempt_context)
-		__intel_context_unpin(i915->preempt_context, engine);
-	__intel_context_unpin(i915->kernel_context, engine);
+		context_unpin(i915->preempt_context, engine);
+	context_unpin(i915->kernel_context, engine);
 
 	i915_timeline_fini(&engine->timeline);
 
@@ -1006,34 +1006,6 @@ bool intel_engines_are_idle(struct drm_i915_private *dev_priv)
 	return true;
 }
 
-/**
- * intel_engine_has_kernel_context:
- * @engine: the engine
- *
- * Returns true if the last context to be executed on this engine, or has been
- * executed if the engine is already idle, is the kernel context
- * (#i915.kernel_context).
- */
-bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine)
-{
-	const struct intel_context *kernel_context =
-		to_intel_context(engine->i915->kernel_context, engine);
-	struct i915_request *rq;
-
-	lockdep_assert_held(&engine->i915->drm.struct_mutex);
-
-	/*
-	 * Check the last context seen by the engine. If active, it will be
-	 * the last request that remains in the timeline. When idle, it is
-	 * the last executed context as tracked by retirement.
-	 */
-	rq = __i915_gem_active_peek(&engine->timeline.last_request);
-	if (rq)
-		return rq->hw_context == kernel_context;
-	else
-		return engine->last_retired_context == kernel_context;
-}
-
 void intel_engines_reset_default_submission(struct drm_i915_private *i915)
 {
 	struct intel_engine_cs *engine;
@@ -1121,6 +1093,8 @@ void intel_engines_park(struct drm_i915_private *i915)
 		i915_gem_batch_pool_fini(&engine->batch_pool);
 		engine->execlists.no_priolist = false;
 	}
+
+	i915->gt.active_engines = 0;
 }
 
 /**
@@ -1152,26 +1126,6 @@ void intel_engines_unpark(struct drm_i915_private *i915)
 	}
 }
 
-/**
- * intel_engine_lost_context: called when the GPU is reset into unknown state
- * @engine: the engine
- *
- * We have either reset the GPU or otherwise about to lose state tracking of
- * the current GPU logical state (e.g. suspend). On next use, it is therefore
- * imperative that we make no presumptions about the current state and load
- * from scratch.
- */
-void intel_engine_lost_context(struct intel_engine_cs *engine)
-{
-	struct intel_context *ce;
-
-	lockdep_assert_held(&engine->i915->drm.struct_mutex);
-
-	ce = fetch_and_zero(&engine->last_retired_context);
-	if (ce)
-		intel_context_unpin(ce);
-}
-
 bool intel_engine_can_store_dword(struct intel_engine_cs *engine)
 {
 	switch (INTEL_GEN(engine->i915)) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 126c5285fd01..76e06ca0490b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -497,17 +497,6 @@ struct intel_engine_cs {
 
 	struct intel_engine_execlists execlists;
 
-	/* Contexts are pinned whilst they are active on the GPU. The last
-	 * context executed remains active whilst the GPU is idle - the
-	 * switch away and write to the context object only occurs on the
-	 * next execution.  Contexts are only unpinned on retirement of the
-	 * following request ensuring that we can always write to the object
-	 * on the context switch even after idling. Across suspend, we switch
-	 * to the kernel context and trash it as the save may not happen
-	 * before the hardware is powered down.
-	 */
-	struct intel_context *last_retired_context;
-
 	/* status_notifier: list of callbacks for context-switch changes */
 	struct atomic_notifier_head context_status_notifier;
 
@@ -969,9 +958,6 @@ void intel_engines_sanitize(struct drm_i915_private *i915, bool force);
 bool intel_engine_is_idle(struct intel_engine_cs *engine);
 bool intel_engines_are_idle(struct drm_i915_private *dev_priv);
 
-bool intel_engine_has_kernel_context(const struct intel_engine_cs *engine);
-void intel_engine_lost_context(struct intel_engine_cs *engine);
-
 void intel_engines_park(struct drm_i915_private *i915);
 void intel_engines_unpark(struct drm_i915_private *i915);
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c
index e77b7ed449ae..378ad0adf2c6 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem.c
@@ -164,7 +164,7 @@ static int igt_gem_suspend(void *arg)
 
 	mutex_lock(&i915->drm.struct_mutex);
 	err = switch_to_context(i915, ctx);
-	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+	if (igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK))
 		err = -EIO;
 	mutex_unlock(&i915->drm.struct_mutex);
 out:
@@ -205,7 +205,7 @@ static int igt_gem_hibernate(void *arg)
 
 	mutex_lock(&i915->drm.struct_mutex);
 	err = switch_to_context(i915, ctx);
-	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+	if (igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK))
 		err = -EIO;
 	mutex_unlock(&i915->drm.struct_mutex);
 out:
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 2cca234fd291..6a241745e78a 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -1097,7 +1097,7 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
 {
 	struct intel_engine_cs *engine;
 	unsigned int tmp;
-	int err;
+	int pass;
 
 	GEM_TRACE("Testing %s\n", __engine_name(i915, engines));
 	for_each_engine_masked(engine, i915, engines, tmp) {
@@ -1110,50 +1110,34 @@ static int __igt_switch_to_kernel_context(struct drm_i915_private *i915,
 		i915_request_add(rq);
 	}
 
-	err = i915_gem_switch_to_kernel_context(i915);
-	if (err)
-		return err;
+	for (pass = 0; pass < 2; pass++) { /* Once busy; once idle */
+		int err;
 
-	for_each_engine_masked(engine, i915, engines, tmp) {
-		if (!engine_has_kernel_context_barrier(engine)) {
-			pr_err("kernel context not last on engine %s!\n",
-			       engine->name);
-			return -EINVAL;
-		}
-	}
+		err = i915_gem_switch_to_kernel_context(i915);
+		if (err)
+			return err;
 
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err)
-		return err;
+		if (!pass) {
+			err = i915_gem_wait_for_idle(i915,
+						     I915_WAIT_LOCKED |
+						     I915_WAIT_FOR_IDLE_PARK,
+						     MAX_SCHEDULE_TIMEOUT);
+			if (err)
+				return err;
+		}
 
-	GEM_BUG_ON(i915->gt.active_requests);
-	for_each_engine_masked(engine, i915, engines, tmp) {
-		if (engine->last_retired_context->gem_context != i915->kernel_context) {
-			pr_err("engine %s not idling in kernel context!\n",
-			       engine->name);
+		if (i915->gt.active_requests) {
+			pr_err("%d active requests remain after switching to kernel context while %s\n",
+			       i915->gt.active_requests,
+			       pass ? "idle" : "busy");
 			return -EINVAL;
 		}
-	}
 
-	err = i915_gem_switch_to_kernel_context(i915);
-	if (err)
-		return err;
-
-	if (i915->gt.active_requests) {
-		pr_err("switch-to-kernel-context emitted %d requests even though it should already be idling in the kernel context\n",
-		       i915->gt.active_requests);
-		return -EINVAL;
+		/* XXX Bonus points for proving we are the kernel context! */
 	}
 
-	for_each_engine_masked(engine, i915, engines, tmp) {
-		if (!intel_engine_has_kernel_context(engine)) {
-			pr_err("kernel context not last on engine %s!\n",
-			       engine->name);
-			return -EINVAL;
-		}
-	}
+	if (igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK))
+		return -EIO;
 
 	return 0;
 }
@@ -1197,8 +1181,6 @@ static int igt_switch_to_kernel_context(void *arg)
 
 out_unlock:
 	GEM_TRACE_DUMP_ON(err);
-	if (igt_flush_test(i915, I915_WAIT_LOCKED))
-		err = -EIO;
 
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
index c8deb961a020..1e9c9f53a830 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
@@ -371,153 +371,6 @@ static int igt_evict_vm(void *arg)
 	return err;
 }
 
-static int igt_evict_contexts(void *arg)
-{
-	const u64 PRETEND_GGTT_SIZE = 16ull << 20;
-	struct drm_i915_private *i915 = arg;
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-	struct reserved {
-		struct drm_mm_node node;
-		struct reserved *next;
-	} *reserved = NULL;
-	intel_wakeref_t wakeref;
-	struct drm_mm_node hole;
-	unsigned long count;
-	int err;
-
-	/*
-	 * The purpose of this test is to verify that we will trigger an
-	 * eviction in the GGTT when constructing a request that requires
-	 * additional space in the GGTT for pinning the context. This space
-	 * is not directly tied to the request so reclaiming it requires
-	 * extra work.
-	 *
-	 * As such this test is only meaningful for full-ppgtt environments
-	 * where the GTT space of the request is separate from the GGTT
-	 * allocation required to build the request.
-	 */
-	if (!HAS_FULL_PPGTT(i915))
-		return 0;
-
-	mutex_lock(&i915->drm.struct_mutex);
-	wakeref = intel_runtime_pm_get(i915);
-
-	/* Reserve a block so that we know we have enough to fit a few rq */
-	memset(&hole, 0, sizeof(hole));
-	err = i915_gem_gtt_insert(&i915->ggtt.vm, &hole,
-				  PRETEND_GGTT_SIZE, 0, I915_COLOR_UNEVICTABLE,
-				  0, i915->ggtt.vm.total,
-				  PIN_NOEVICT);
-	if (err)
-		goto out_locked;
-
-	/* Make the GGTT appear small by filling it with unevictable nodes */
-	count = 0;
-	do {
-		struct reserved *r;
-
-		r = kcalloc(1, sizeof(*r), GFP_KERNEL);
-		if (!r) {
-			err = -ENOMEM;
-			goto out_locked;
-		}
-
-		if (i915_gem_gtt_insert(&i915->ggtt.vm, &r->node,
-					1ul << 20, 0, I915_COLOR_UNEVICTABLE,
-					0, i915->ggtt.vm.total,
-					PIN_NOEVICT)) {
-			kfree(r);
-			break;
-		}
-
-		r->next = reserved;
-		reserved = r;
-
-		count++;
-	} while (1);
-	drm_mm_remove_node(&hole);
-	mutex_unlock(&i915->drm.struct_mutex);
-	pr_info("Filled GGTT with %lu 1MiB nodes\n", count);
-
-	/* Overfill the GGTT with context objects and so try to evict one. */
-	for_each_engine(engine, i915, id) {
-		struct i915_sw_fence fence;
-		struct drm_file *file;
-
-		file = mock_file(i915);
-		if (IS_ERR(file)) {
-			err = PTR_ERR(file);
-			break;
-		}
-
-		count = 0;
-		mutex_lock(&i915->drm.struct_mutex);
-		onstack_fence_init(&fence);
-		do {
-			struct i915_request *rq;
-			struct i915_gem_context *ctx;
-
-			ctx = live_context(i915, file);
-			if (!ctx)
-				break;
-
-			/* We will need some GGTT space for the rq's context */
-			igt_evict_ctl.fail_if_busy = true;
-			rq = i915_request_alloc(engine, ctx);
-			igt_evict_ctl.fail_if_busy = false;
-
-			if (IS_ERR(rq)) {
-				/* When full, fail_if_busy will trigger EBUSY */
-				if (PTR_ERR(rq) != -EBUSY) {
-					pr_err("Unexpected error from request alloc (ctx hw id %u, on %s): %d\n",
-					       ctx->hw_id, engine->name,
-					       (int)PTR_ERR(rq));
-					err = PTR_ERR(rq);
-				}
-				break;
-			}
-
-			/* Keep every request/ctx pinned until we are full */
-			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
-							       &fence,
-							       GFP_KERNEL);
-			if (err < 0)
-				break;
-
-			i915_request_add(rq);
-			count++;
-			err = 0;
-		} while(1);
-		mutex_unlock(&i915->drm.struct_mutex);
-
-		onstack_fence_fini(&fence);
-		pr_info("Submitted %lu contexts/requests on %s\n",
-			count, engine->name);
-
-		mock_file_free(i915, file);
-		if (err)
-			break;
-	}
-
-	mutex_lock(&i915->drm.struct_mutex);
-out_locked:
-	while (reserved) {
-		struct reserved *next = reserved->next;
-
-		drm_mm_remove_node(&reserved->node);
-		kfree(reserved);
-
-		reserved = next;
-	}
-	if (drm_mm_node_allocated(&hole))
-		drm_mm_remove_node(&hole);
-	intel_runtime_pm_put(i915, wakeref);
-	mutex_unlock(&i915->drm.struct_mutex);
-
-	return err;
-}
-
 int i915_gem_evict_mock_selftests(void)
 {
 	static const struct i915_subtest tests[] = {
@@ -548,7 +401,6 @@ int i915_gem_evict_mock_selftests(void)
 int i915_gem_evict_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
-		SUBTEST(igt_evict_contexts),
 	};
 
 	if (i915_terminally_wedged(&i915->gpu_error))
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 128d37bba1ac..04d66b4303ab 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1669,7 +1669,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
 	err = i915_subtests(tests, i915);
 
 	mutex_lock(&i915->drm.struct_mutex);
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	i915_modparams.enable_hangcheck = saved_hangcheck;
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 2b2ecd76c2ac..607c969ea605 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -67,7 +67,7 @@ static int live_sanitycheck(void *arg)
 err_spin:
 	igt_spinner_fini(&spin);
 err_unlock:
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
 	return err;
@@ -161,7 +161,7 @@ static int live_preempt(void *arg)
 err_spin_hi:
 	igt_spinner_fini(&spin_hi);
 err_unlock:
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
 	return err;
@@ -255,7 +255,7 @@ static int live_late_preempt(void *arg)
 err_spin_hi:
 	igt_spinner_fini(&spin_hi);
 err_unlock:
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
 	return err;
@@ -379,7 +379,7 @@ static int live_preempt_hang(void *arg)
 err_spin_hi:
 	igt_spinner_fini(&spin_hi);
 err_unlock:
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 	intel_runtime_pm_put(i915, wakeref);
 	mutex_unlock(&i915->drm.struct_mutex);
 	return err;
diff --git a/drivers/gpu/drm/i915/selftests/intel_workarounds.c b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
index b15c4f26c593..e8664bf5c873 100644
--- a/drivers/gpu/drm/i915/selftests/intel_workarounds.c
+++ b/drivers/gpu/drm/i915/selftests/intel_workarounds.c
@@ -505,7 +505,7 @@ live_engine_reset_gt_engine_workarounds(void *arg)
 	igt_global_reset_unlock(i915);
 	kernel_context_close(ctx);
 
-	igt_flush_test(i915, I915_WAIT_LOCKED);
+	igt_flush_test(i915, I915_WAIT_LOCKED | I915_WAIT_FOR_IDLE_PARK);
 
 	return ret;
 }
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index e4db9a31b510..2009e776b5d8 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -49,6 +49,7 @@ mock_context(struct drm_i915_private *i915,
 		struct intel_context *ce = &ctx->__engine[n];
 
 		ce->gem_context = ctx;
+		INIT_LIST_HEAD(&ce->active_link);
 		INIT_LIST_HEAD(&ce->signal_link);
 		INIT_LIST_HEAD(&ce->signals);
 	}
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index c1cd8b27b32a..0c5649044bc9 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -228,7 +228,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 		goto err_free;
 	i915_timeline_set_subclass(&engine->base.timeline, TIMELINE_ENGINE);
 
+	engine->base.execlists.queue_priority = INT_MIN;
+	engine->base.execlists.queue = RB_ROOT_CACHED;
+
 	intel_engine_init_breadcrumbs(&engine->base);
+	intel_engine_init_batch_pool(&engine->base);
 
 	/* fake hw queue */
 	spin_lock_init(&engine->hw_lock);
@@ -271,15 +275,10 @@ void mock_engine_free(struct intel_engine_cs *engine)
 {
 	struct mock_engine *mock =
 		container_of(engine, typeof(*mock), base);
-	struct intel_context *ce;
 
 	GEM_BUG_ON(timer_pending(&mock->hw_delay));
 
-	ce = fetch_and_zero(&engine->last_retired_context);
-	if (ce)
-		intel_context_unpin(ce);
-
-	__intel_context_unpin(engine->i915->kernel_context, engine);
+	context_unpin(engine->i915->kernel_context, engine);
 
 	intel_engine_fini_breadcrumbs(engine);
 	i915_timeline_fini(&engine->timeline);
diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
index 41ae502361d7..42fa8b4cbd38 100644
--- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
+++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
@@ -47,6 +47,15 @@ void mock_device_flush(struct drm_i915_private *i915)
 	GEM_BUG_ON(i915->gt.active_requests);
 }
 
+static void mock_device_park(struct drm_i915_private *i915)
+{
+	intel_engines_park(i915);
+	intel_contexts_park(i915);
+	i915_timelines_park(i915);
+
+	i915_vma_parked(i915);
+}
+
 static void mock_device_release(struct drm_device *dev)
 {
 	struct drm_i915_private *i915 = to_i915(dev);
@@ -55,7 +64,7 @@ static void mock_device_release(struct drm_device *dev)
 
 	mutex_lock(&i915->drm.struct_mutex);
 	mock_device_flush(i915);
-	i915_gem_contexts_lost(i915);
+	mock_device_park(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	drain_delayed_work(&i915->gt.retire_work);
@@ -200,7 +209,8 @@ struct drm_i915_private *mock_gem_device(void)
 	INIT_DELAYED_WORK(&i915->gt.retire_work, mock_retire_work_handler);
 	INIT_DELAYED_WORK(&i915->gt.idle_work, mock_idle_work_handler);
 
-	i915->gt.awake = true;
+	i915->gt.awake = -1;
+	i915->gt.epoch = -1;
 
 	i915->objects = KMEM_CACHE(mock_object, SLAB_HWCACHE_ALIGN);
 	if (!i915->objects)
@@ -229,6 +239,7 @@ struct drm_i915_private *mock_gem_device(void)
 
 	i915_timelines_init(i915);
 
+	INIT_LIST_HEAD(&i915->gt.active_contexts);
 	INIT_LIST_HEAD(&i915->gt.active_rings);
 	INIT_LIST_HEAD(&i915->gt.closed_vma);
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (23 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 24/38] drm/i915: Avoid presumption of execution ordering for kernel context switching Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-22  9:20   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 26/38] drm/i915: Remove the global per-engine execution timeline Chris Wilson
                   ` (14 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

As we no longer have a precise indication of requests queued to an
engine, make no presumptions and just sample the ring registers to see
if the engine is busy.
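
In rough outline, the busyness check becomes a register sample rather than
a seqno comparison (a simplified sketch; forcewake handling and the
WAIT/SEMA sub-samples are elided here, see the full diff below):

	for_each_engine(engine, dev_priv, id) {
		/* the ring reports idle via MODE_IDLE in RING_MI_MODE */
		u32 mode = I915_READ_FW(RING_MI_MODE(engine->mmio_base));

		if (mode & MODE_IDLE)
			continue; /* nothing executing on this engine */

		/* otherwise attribute the whole sample period to busy time */
		add_sample(&engine->pmu.sample[I915_SAMPLE_BUSY], period_ns);
	}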

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_pmu.c | 47 +++++++++++----------------------
 1 file changed, 16 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
index b1cb2d3cae16..452589a7e473 100644
--- a/drivers/gpu/drm/i915/i915_pmu.c
+++ b/drivers/gpu/drm/i915/i915_pmu.c
@@ -148,14 +148,6 @@ void i915_pmu_gt_unparked(struct drm_i915_private *i915)
 	spin_unlock_irq(&i915->pmu.lock);
 }
 
-static bool grab_forcewake(struct drm_i915_private *i915, bool fw)
-{
-	if (!fw)
-		intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
-
-	return true;
-}
-
 static void
 add_sample(struct i915_pmu_sample *sample, u32 val)
 {
@@ -168,7 +160,6 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 	intel_wakeref_t wakeref;
-	bool fw = false;
 
 	if ((dev_priv->pmu.enable & ENGINE_SAMPLE_MASK) == 0)
 		return;
@@ -180,37 +171,31 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
 	if (!wakeref)
 		return;
 
+	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
 	for_each_engine(engine, dev_priv, id) {
-		u32 current_seqno = intel_engine_get_seqno(engine);
-		u32 last_seqno = intel_engine_last_submit(engine);
+		typeof(engine->pmu) *pmu = &engine->pmu;
 		u32 val;
 
-		val = !i915_seqno_passed(current_seqno, last_seqno);
-
-		if (val)
-			add_sample(&engine->pmu.sample[I915_SAMPLE_BUSY],
-				   period_ns);
+		val = I915_READ_FW(RING_MI_MODE(engine->mmio_base));
+		if (val & MODE_IDLE)
+			continue;
 
-		if (val && (engine->pmu.enable &
-		    (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) {
-			fw = grab_forcewake(dev_priv, fw);
+		add_sample(&pmu->sample[I915_SAMPLE_BUSY], period_ns);
 
+		if (pmu->enable &
+		    (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA))) {
 			val = I915_READ_FW(RING_CTL(engine->mmio_base));
-		} else {
-			val = 0;
-		}
 
-		if (val & RING_WAIT)
-			add_sample(&engine->pmu.sample[I915_SAMPLE_WAIT],
-				   period_ns);
+			if (val & RING_WAIT)
+				add_sample(&pmu->sample[I915_SAMPLE_WAIT],
+					   period_ns);
 
-		if (val & RING_WAIT_SEMAPHORE)
-			add_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
-				   period_ns);
+			if (val & RING_WAIT_SEMAPHORE)
+				add_sample(&pmu->sample[I915_SAMPLE_SEMA],
+					   period_ns);
+		}
 	}
-
-	if (fw)
-		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
+	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
 
 	intel_runtime_pm_put(dev_priv, wakeref);
 }
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 26/38] drm/i915: Remove the global per-engine execution timeline
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (24 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-18 14:00 ` [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method Chris Wilson
                   ` (13 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

For future GuC firmware, the intention is to submit a large number of
contexts and their interdependencies and leave the execution order to
the firmware. As such, we want to allow the firmware the freedom to execute
independent contexts in whatever order suits it and so must forgo the
concept of a single execution timeline with a predefined global seqno.
As we have transitioned to per-context timelines, we should be agnostic
to the actual execution order, tracking execution along each timeline
independently.

Having made this transition to per-context timelines, we can remove the
engine->timeline and request->global_seqno.
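
In effect, the per-engine bookkeeping reduces to the following (a loose
sketch only; locking, signaling and unwind details are simplified, and
i915_seqno_passed() is the existing wrap-safe seqno comparison):

	/* submit: move from the context timeline onto the engine's list */
	spin_lock(&engine->execution_lock);
	list_move_tail(&rq->link, &engine->requests);
	spin_unlock(&engine->execution_lock);

	/* retire: just drop it again; there is no engine seqno to advance */
	spin_lock(&engine->execution_lock);
	list_del_init(&rq->link);
	spin_unlock(&engine->execution_lock);

	/* completion is judged solely against the request's own timeline */
	completed = i915_seqno_passed(hwsp_seqno(rq), (u32)rq->fence.seqno);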

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c           |   1 -
 drivers/gpu/drm/i915/i915_gem.c               |  21 +++-
 drivers/gpu/drm/i915/i915_gem_context.c       |   2 -
 drivers/gpu/drm/i915/i915_gpu_error.c         |  47 ++------
 drivers/gpu/drm/i915/i915_gpu_error.h         |   6 -
 drivers/gpu/drm/i915/i915_request.c           | 106 +++++-------------
 drivers/gpu/drm/i915/i915_request.h           |  32 ------
 drivers/gpu/drm/i915/i915_reset.c             |  18 ++-
 drivers/gpu/drm/i915/i915_scheduler.c         |  12 +-
 drivers/gpu/drm/i915/i915_timeline.h          |   2 -
 drivers/gpu/drm/i915/i915_trace.h             |  25 ++---
 drivers/gpu/drm/i915/intel_engine_cs.c        |  54 +++------
 drivers/gpu/drm/i915/intel_guc_submission.c   |   8 +-
 drivers/gpu/drm/i915/intel_lrc.c              |  92 +++++----------
 drivers/gpu/drm/i915/intel_ringbuffer.c       |  72 +++---------
 drivers/gpu/drm/i915/intel_ringbuffer.h       |  49 +-------
 drivers/gpu/drm/i915/selftests/i915_request.c |   3 +-
 .../gpu/drm/i915/selftests/intel_hangcheck.c  |   5 +-
 drivers/gpu/drm/i915/selftests/mock_engine.c  |  15 +--
 19 files changed, 139 insertions(+), 431 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 16a9384de478..c308c2c589f0 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1322,7 +1322,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 			   engine->hangcheck.next_seqno,
 			   jiffies_to_msecs(jiffies -
 					    engine->hangcheck.action_timestamp));
-
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 96b33f8ba9a9..9a78e1c9b323 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2879,6 +2879,14 @@ i915_gem_object_pwrite_gtt(struct drm_i915_gem_object *obj,
 	return 0;
 }
 
+static bool match_ring(struct i915_request *rq)
+{
+	struct drm_i915_private *dev_priv = rq->i915;
+	u32 ring = I915_READ(RING_START(rq->engine->mmio_base));
+
+	return ring == i915_ggtt_offset(rq->ring->vma);
+}
+
 struct i915_request *
 i915_gem_find_active_request(struct intel_engine_cs *engine)
 {
@@ -2896,15 +2904,22 @@ i915_gem_find_active_request(struct intel_engine_cs *engine)
 	 * At all other times, we must assume the GPU is still running, but
 	 * we only care about the snapshot of this moment.
 	 */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
-	list_for_each_entry(request, &engine->timeline.requests, link) {
+	spin_lock_irqsave(&engine->execution_lock, flags);
+	list_for_each_entry(request, &engine->requests, link) {
 		if (i915_request_completed(request))
 			continue;
 
+		if (!i915_request_started(request))
+			continue;
+
+		/* More than one preemptible request may match! */
+		if (!match_ring(request))
+			continue;
+
 		active = request;
 		break;
 	}
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 
 	return active;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index ecba3ee204a6..e8334c4bc130 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -635,8 +635,6 @@ last_request_on_engine(struct i915_timeline *timeline,
 {
 	struct i915_request *rq;
 
-	GEM_BUG_ON(timeline == &engine->timeline);
-
 	rq = i915_gem_active_raw(&timeline->last_request,
 				 &engine->i915->drm.struct_mutex);
 	if (rq && rq->engine == engine) {
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index 7d26c25ee785..ecab20b6d26e 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -380,19 +380,16 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
 	err_printf(m, "%s [%d]:\n", name, count);
 
 	while (count--) {
-		err_printf(m, "    %08x_%08x %8u %02x %02x %02x",
+		err_printf(m, "    %08x_%08x %8u %02x %02x",
 			   upper_32_bits(err->gtt_offset),
 			   lower_32_bits(err->gtt_offset),
 			   err->size,
 			   err->read_domains,
-			   err->write_domain,
-			   err->wseqno);
+			   err->write_domain);
 		err_puts(m, tiling_flag(err->tiling));
 		err_puts(m, dirty_flag(err->dirty));
 		err_puts(m, purgeable_flag(err->purgeable));
 		err_puts(m, err->userptr ? " userptr" : "");
-		err_puts(m, err->engine != -1 ? " " : "");
-		err_puts(m, engine_name(m->i915, err->engine));
 		err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
 
 		if (err->name)
@@ -444,12 +441,11 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
 				const struct drm_i915_error_request *erq,
 				const unsigned long epoch)
 {
-	if (!erq->seqno)
+	if (!erq->context)
 		return;
 
-	err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
-		   prefix, erq->pid, erq->ban_score,
-		   erq->context, erq->seqno, erq->sched_attr.priority,
+	err_printf(m, "%s pid %d, ban score %d, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
+		   prefix, erq->pid, erq->ban_score, erq->sched_attr.priority,
 		   jiffies_to_msecs(erq->jiffies - epoch),
 		   erq->start, erq->head, erq->tail);
 }
@@ -528,8 +524,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 				   ee->vm_info.pp_dir_base);
 		}
 	}
-	err_printf(m, "  seqno: 0x%08x\n", ee->seqno);
-	err_printf(m, "  last_seqno: 0x%08x\n", ee->last_seqno);
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
 	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
@@ -1056,27 +1050,6 @@ i915_error_object_create(struct drm_i915_private *i915,
 	return dst;
 }
 
-/* The error capture is special as tries to run underneath the normal
- * locking rules - so we use the raw version of the i915_gem_active lookup.
- */
-static inline u32
-__active_get_seqno(struct i915_gem_active *active)
-{
-	struct i915_request *request;
-
-	request = __i915_gem_active_peek(active);
-	return request ? request->global_seqno : 0;
-}
-
-static inline int
-__active_get_engine_id(struct i915_gem_active *active)
-{
-	struct i915_request *request;
-
-	request = __i915_gem_active_peek(active);
-	return request ? request->engine->id : -1;
-}
-
 static void capture_bo(struct drm_i915_error_buffer *err,
 		       struct i915_vma *vma)
 {
@@ -1085,9 +1058,6 @@ static void capture_bo(struct drm_i915_error_buffer *err,
 	err->size = obj->base.size;
 	err->name = obj->base.name;
 
-	err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
-	err->engine = __active_get_engine_id(&obj->frontbuffer_write);
-
 	err->gtt_offset = vma->node.start;
 	err->read_domains = obj->read_domains;
 	err->write_domain = obj->write_domain;
@@ -1218,8 +1188,6 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 
 	ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
 	ee->acthd = intel_engine_get_active_head(engine);
-	ee->seqno = intel_engine_get_seqno(engine);
-	ee->last_seqno = intel_engine_last_submit(engine);
 	ee->start = I915_READ_START(engine);
 	ee->head = I915_READ_HEAD(engine);
 	ee->tail = I915_READ_TAIL(engine);
@@ -1292,7 +1260,6 @@ static void record_request(struct i915_request *request,
 	erq->context = ctx->hw_id;
 	erq->sched_attr = request->sched.attr;
 	erq->ban_score = atomic_read(&ctx->ban_score);
-	erq->seqno = request->global_seqno;
 	erq->jiffies = request->emitted_jiffies;
 	erq->start = i915_ggtt_offset(request->ring->vma);
 	erq->head = request->head;
@@ -1312,7 +1279,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 
 	count = 0;
 	request = first;
-	list_for_each_entry_from(request, &engine->timeline.requests, link)
+	list_for_each_entry_from(request, &engine->requests, link)
 		count++;
 	if (!count)
 		return;
@@ -1325,7 +1292,7 @@ static void engine_record_requests(struct intel_engine_cs *engine,
 
 	count = 0;
 	request = first;
-	list_for_each_entry_from(request, &engine->timeline.requests, link) {
+	list_for_each_entry_from(request, &engine->requests, link) {
 		if (count >= ee->num_requests) {
 			/*
 			 * If the ring request list was changed in
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.h b/drivers/gpu/drm/i915/i915_gpu_error.h
index 99a53c0cd6da..59f5cf327edd 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.h
+++ b/drivers/gpu/drm/i915/i915_gpu_error.h
@@ -94,8 +94,6 @@ struct i915_gpu_state {
 		u32 cpu_ring_head;
 		u32 cpu_ring_tail;
 
-		u32 last_seqno;
-
 		/* Register state */
 		u32 start;
 		u32 tail;
@@ -108,7 +106,6 @@ struct i915_gpu_state {
 		u32 bbstate;
 		u32 instpm;
 		u32 instps;
-		u32 seqno;
 		u64 bbaddr;
 		u64 acthd;
 		u32 fault_reg;
@@ -149,7 +146,6 @@ struct i915_gpu_state {
 			pid_t pid;
 			u32 context;
 			int ban_score;
-			u32 seqno;
 			u32 start;
 			u32 head;
 			u32 tail;
@@ -169,7 +165,6 @@ struct i915_gpu_state {
 	struct drm_i915_error_buffer {
 		u32 size;
 		u32 name;
-		u32 wseqno;
 		u64 gtt_offset;
 		u32 read_domains;
 		u32 write_domain;
@@ -178,7 +173,6 @@ struct i915_gpu_state {
 		u32 dirty:1;
 		u32 purgeable:1;
 		u32 userptr:1;
-		s32 engine:4;
 		u32 cache_level:3;
 	} *active_bo[I915_NUM_ENGINES], *pinned_bo;
 	u32 active_bo_count[I915_NUM_ENGINES], pinned_bo_count;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 6ff0c47c3a0d..7bccf578cd65 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -179,24 +179,15 @@ static void free_capture_list(struct i915_request *request)
 	}
 }
 
-static void __retire_engine_request(struct intel_engine_cs *engine,
-				    struct i915_request *rq)
+static void __retire_engine(struct i915_request *rq)
 {
-	GEM_TRACE("%s(%s) fence %llx:%lld, global=%d, current %d:%d\n",
-		  __func__, engine->name,
-		  rq->fence.context, rq->fence.seqno,
-		  rq->global_seqno,
-		  hwsp_seqno(rq),
-		  intel_engine_get_seqno(engine));
-
-	GEM_BUG_ON(!i915_request_completed(rq));
+	struct intel_engine_cs *engine = rq->engine;
 
 	local_irq_disable();
 
-	spin_lock(&engine->timeline.lock);
-	GEM_BUG_ON(!list_is_first(&rq->link, &engine->timeline.requests));
+	spin_lock(&engine->execution_lock);
 	list_del_init(&rq->link);
-	spin_unlock(&engine->timeline.lock);
+	spin_unlock(&engine->execution_lock);
 
 	spin_lock(&rq->lock);
 	if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
@@ -212,33 +203,14 @@ static void __retire_engine_request(struct intel_engine_cs *engine,
 	local_irq_enable();
 }
 
-static void __retire_engine_upto(struct intel_engine_cs *engine,
-				 struct i915_request *rq)
-{
-	struct i915_request *tmp;
-
-	if (list_empty(&rq->link))
-		return;
-
-	do {
-		tmp = list_first_entry(&engine->timeline.requests,
-				       typeof(*tmp), link);
-
-		GEM_BUG_ON(tmp->engine != engine);
-		__retire_engine_request(engine, tmp);
-	} while (tmp != rq);
-}
-
 static void i915_request_retire(struct i915_request *request)
 {
 	struct i915_gem_active *active, *next;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  request->engine->name,
 		  request->fence.context, request->fence.seqno,
-		  request->global_seqno,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(request->engine));
+		  hwsp_seqno(request));
 
 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_sw_fence_signaled(&request->submit));
@@ -283,7 +255,7 @@ static void i915_request_retire(struct i915_request *request)
 	/* Retirement decays the ban score as it is a sign of ctx progress */
 	atomic_dec_if_positive(&request->gem_context->ban_score);
 
-	__retire_engine_upto(request->engine, request);
+	__retire_engine(request);
 
 	unreserve_gt(request->i915);
 
@@ -296,12 +268,10 @@ void i915_request_retire_upto(struct i915_request *rq)
 	struct intel_ring *ring = rq->ring;
 	struct i915_request *tmp;
 
-	GEM_TRACE("%s fence %llx:%lld, global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld, current %d\n",
 		  rq->engine->name,
 		  rq->fence.context, rq->fence.seqno,
-		  rq->global_seqno,
-		  hwsp_seqno(rq),
-		  intel_engine_get_seqno(rq->engine));
+		  hwsp_seqno(rq));
 
 	lockdep_assert_held(&rq->i915->drm.struct_mutex);
 	GEM_BUG_ON(!i915_request_completed(rq));
@@ -324,41 +294,30 @@ static u32 timeline_get_seqno(struct i915_timeline *tl)
 }
 
 static void move_to_timeline(struct i915_request *request,
-			     struct i915_timeline *timeline)
+			     struct list_head *timeline)
 {
-	GEM_BUG_ON(request->timeline == &request->engine->timeline);
-	lockdep_assert_held(&request->engine->timeline.lock);
+	lockdep_assert_held(&request->engine->execution_lock);
 
 	spin_lock(&request->timeline->lock);
-	list_move_tail(&request->link, &timeline->requests);
+	list_move_tail(&request->link, timeline);
 	spin_unlock(&request->timeline->lock);
 }
 
 void __i915_request_submit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	u32 seqno;
 
-	GEM_TRACE("%s fence %llx:%lld -> global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld -> current %d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
-		  engine->timeline.seqno + 1,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(engine));
+		  hwsp_seqno(request));
 
 	GEM_BUG_ON(!irqs_disabled());
-	lockdep_assert_held(&engine->timeline.lock);
-
-	GEM_BUG_ON(request->global_seqno);
-
-	seqno = timeline_get_seqno(&engine->timeline);
-	GEM_BUG_ON(!seqno);
-	GEM_BUG_ON(intel_engine_signaled(engine, seqno));
+	lockdep_assert_held(&engine->execution_lock);
 
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
 	set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
-	request->global_seqno = seqno;
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
 	    !intel_engine_enable_signaling(request))
 		intel_engine_queue_breadcrumbs(engine);
@@ -368,7 +327,7 @@ void __i915_request_submit(struct i915_request *request)
 				request->ring->vaddr + request->postfix);
 
 	/* Transfer from per-context onto the global per-engine timeline */
-	move_to_timeline(request, &engine->timeline);
+	move_to_timeline(request, &engine->requests);
 
 	trace_i915_request_execute(request);
 }
@@ -379,54 +338,41 @@ void i915_request_submit(struct i915_request *request)
 	unsigned long flags;
 
 	/* Will be called from irq-context when using foreign fences. */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	__i915_request_submit(request);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 void __i915_request_unsubmit(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 
-	GEM_TRACE("%s fence %llx:%lld <- global=%d, current %d:%d\n",
+	GEM_TRACE("%s fence %llx:%lld <- current %d\n",
 		  engine->name,
 		  request->fence.context, request->fence.seqno,
-		  request->global_seqno,
-		  hwsp_seqno(request),
-		  intel_engine_get_seqno(engine));
+		  hwsp_seqno(request));
 
 	GEM_BUG_ON(!irqs_disabled());
-	lockdep_assert_held(&engine->timeline.lock);
+	lockdep_assert_held(&engine->execution_lock);
 
 	/*
 	 * Only unwind in reverse order, required so that the per-context list
 	 * is kept in seqno/ring order.
 	 */
-	GEM_BUG_ON(!request->global_seqno);
-	GEM_BUG_ON(request->global_seqno != engine->timeline.seqno);
-	GEM_BUG_ON(intel_engine_has_completed(engine, request->global_seqno));
-	engine->timeline.seqno--;
 
 	/* We may be recursing from the signal callback of another i915 fence */
 	spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
-	request->global_seqno = 0;
 	if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags))
 		intel_engine_cancel_signaling(request);
 	clear_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
 	spin_unlock(&request->lock);
 
 	/* Transfer back from the global per-engine timeline to per-context */
-	move_to_timeline(request, request->timeline);
+	move_to_timeline(request, &request->timeline->requests);
 
-	/*
-	 * We don't need to wake_up any waiters on request->execute, they
-	 * will get woken by any other event or us re-adding this request
-	 * to the engine timeline (__i915_request_submit()). The waiters
-	 * should be quite adapt at finding that the request now has a new
-	 * global_seqno to the one they went to sleep on.
-	 */
+	GEM_BUG_ON(i915_request_completed(request));
 }
 
 void i915_request_unsubmit(struct i915_request *request)
@@ -435,11 +381,11 @@ void i915_request_unsubmit(struct i915_request *request)
 	unsigned long flags;
 
 	/* Will be called from irq-context when using foreign fences. */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	__i915_request_unsubmit(request);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static int __i915_sw_fence_call
@@ -609,7 +555,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	rq->hw_context = ce;
 	rq->ring = ce->ring;
 	rq->timeline = ce->ring->timeline;
-	GEM_BUG_ON(rq->timeline == &engine->timeline);
 	rq->hwsp_seqno = rq->timeline->hwsp_seqno;
 
 	spin_lock_init(&rq->lock);
@@ -625,7 +570,6 @@ i915_request_alloc(struct intel_engine_cs *engine, struct i915_gem_context *ctx)
 	i915_sched_node_init(&rq->sched);
 
 	/* No zalloc, must clear what we need by hand */
-	rq->global_seqno = 0;
 	rq->file_priv = NULL;
 	rq->batch = NULL;
 	rq->capture_list = NULL;
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index dd413d51cc61..679b4663f774 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -129,14 +129,6 @@ struct i915_request {
 	 */
 	const u32 *hwsp_seqno;
 
-	/**
-	 * GEM sequence number associated with this request on the
-	 * global execution timeline. It is zero when the request is not
-	 * on the HW queue (i.e. not on the engine timeline list).
-	 * Its value is guarded by the timeline spinlock.
-	 */
-	u32 global_seqno;
-
 	/** Position in the ring of the start of the request */
 	u32 head;
 
@@ -229,30 +221,6 @@ i915_request_put(struct i915_request *rq)
 	dma_fence_put(&rq->fence);
 }
 
-/**
- * i915_request_global_seqno - report the current global seqno
- * @request - the request
- *
- * A request is assigned a global seqno only when it is on the hardware
- * execution queue. The global seqno can be used to maintain a list of
- * requests on the same engine in retirement order, for example for
- * constructing a priority queue for waiting. Prior to its execution, or
- * if it is subsequently removed in the event of preemption, its global
- * seqno is zero. As both insertion and removal from the execution queue
- * may operate in IRQ context, it is not guarded by the usual struct_mutex
- * BKL. Instead those relying on the global seqno must be prepared for its
- * value to change between reads. Only when the request is complete can
- * the global seqno be stable (due to the memory barriers on submitting
- * the commands to the hardware to write the breadcrumb, if the HWS shows
- * that it has passed the global seqno and the global seqno is unchanged
- * after the read, it is indeed complete).
- */
-static inline u32
-i915_request_global_seqno(const struct i915_request *request)
-{
-	return READ_ONCE(request->global_seqno);
-}
-
 int i915_request_await_object(struct i915_request *to,
 			      struct drm_i915_gem_object *obj,
 			      bool write);
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index d846f49401a6..e0d87e287b10 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -19,17 +19,13 @@ static void engine_skip_context(struct i915_request *rq)
 	struct i915_gem_context *hung_ctx = rq->gem_context;
 	struct i915_timeline *timeline = rq->timeline;
 
-	lockdep_assert_held(&engine->timeline.lock);
-	GEM_BUG_ON(timeline == &engine->timeline);
+	lockdep_assert_held(&engine->execution_lock);
 
 	spin_lock(&timeline->lock);
 
-	if (rq->global_seqno) {
-		list_for_each_entry_continue(rq,
-					     &engine->timeline.requests, link)
-			if (rq->gem_context == hung_ctx)
-				i915_request_skip(rq, -EIO);
-	}
+	list_for_each_entry(rq, &engine->requests, link)
+		if (!i915_request_completed(rq) && rq->gem_context == hung_ctx)
+			i915_request_skip(rq, -EIO);
 
 	list_for_each_entry(rq, &timeline->requests, link)
 		i915_request_skip(rq, -EIO);
@@ -96,7 +92,7 @@ static void context_mark_innocent(struct i915_gem_context *ctx)
 
 void i915_reset_request(struct i915_request *rq, bool guilty)
 {
-	lockdep_assert_held(&rq->engine->timeline.lock);
+	lockdep_assert_held(&rq->engine->execution_lock);
 	GEM_BUG_ON(i915_request_completed(rq));
 
 	if (guilty) {
@@ -754,10 +750,10 @@ static void nop_submit_request(struct i915_request *request)
 		  engine->name, request->fence.context, request->fence.seqno);
 	dma_fence_set_error(&request->fence, -EIO);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 	__i915_request_submit(request);
 	i915_request_fake_complete(request);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 
 	intel_engine_queue_breadcrumbs(engine);
 }
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index e0b177687bec..53e76a91ad75 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -165,7 +165,7 @@ i915_sched_lookup_priolist(struct intel_engine_cs *engine, int prio)
 	bool first = true;
 	int idx, i;
 
-	lockdep_assert_held(&engine->timeline.lock);
+	lockdep_assert_held(&engine->execution_lock);
 	assert_priolists(execlists, INT_MAX);
 
 	/* buckets sorted from highest [in slot 0] to lowest priority */
@@ -232,8 +232,8 @@ sched_lock_engine(struct i915_sched_node *node, struct intel_engine_cs *locked)
 	GEM_BUG_ON(!locked);
 
 	if (engine != locked) {
-		spin_unlock(&locked->timeline.lock);
-		spin_lock(&engine->timeline.lock);
+		spin_unlock(&locked->execution_lock);
+		spin_lock(&engine->execution_lock);
 	}
 
 	return engine;
@@ -244,7 +244,7 @@ static bool inflight(const struct i915_request *rq,
 {
 	const struct i915_request *active;
 
-	if (!rq->global_seqno)
+	if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags))
 		return false;
 
 	active = port_request(engine->execlists.port);
@@ -331,7 +331,7 @@ static void __i915_schedule(struct i915_request *rq,
 
 	last = NULL;
 	engine = rq->engine;
-	spin_lock_irq(&engine->timeline.lock);
+	spin_lock_irq(&engine->execution_lock);
 
 	/* Fifo and depth-first replacement ensure our deps execute before us */
 	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
@@ -383,7 +383,7 @@ static void __i915_schedule(struct i915_request *rq,
 		tasklet_hi_schedule(&engine->execlists.tasklet);
 	}
 
-	spin_unlock_irq(&engine->timeline.lock);
+	spin_unlock_irq(&engine->execution_lock);
 }
 
 void i915_schedule(struct i915_request *rq, const struct i915_sched_attr *attr)
diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
index 421eb34568de..d06fa094a7be 100644
--- a/drivers/gpu/drm/i915/i915_timeline.h
+++ b/drivers/gpu/drm/i915/i915_timeline.h
@@ -39,8 +39,6 @@ struct i915_timeline {
 	u32 seqno;
 
 	spinlock_t lock;
-#define TIMELINE_CLIENT 0 /* default subclass */
-#define TIMELINE_ENGINE 1
 
 	unsigned int pin_count;
 	const u32 *hwsp_seqno;
diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h
index cb5bc65d575d..d8fe328ada36 100644
--- a/drivers/gpu/drm/i915/i915_trace.h
+++ b/drivers/gpu/drm/i915/i915_trace.h
@@ -625,7 +625,6 @@ DECLARE_EVENT_CLASS(i915_request,
 			     __field(u16, class)
 			     __field(u16, instance)
 			     __field(u32, seqno)
-			     __field(u32, global)
 			     ),
 
 	    TP_fast_assign(
@@ -635,13 +634,11 @@ DECLARE_EVENT_CLASS(i915_request,
 			   __entry->instance = rq->engine->instance;
 			   __entry->ctx = rq->fence.context;
 			   __entry->seqno = rq->fence.seqno;
-			   __entry->global = rq->global_seqno;
 			   ),
 
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, global=%u",
+	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u",
 		      __entry->dev, __entry->class, __entry->instance,
-		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->global)
+		      __entry->hw_id, __entry->ctx, __entry->seqno)
 );
 
 DEFINE_EVENT(i915_request, i915_request_add,
@@ -671,7 +668,6 @@ TRACE_EVENT(i915_request_in,
 			     __field(u16, class)
 			     __field(u16, instance)
 			     __field(u32, seqno)
-			     __field(u32, global_seqno)
 			     __field(u32, port)
 			     __field(u32, prio)
 			    ),
@@ -683,15 +679,14 @@ TRACE_EVENT(i915_request_in,
 			   __entry->instance = rq->engine->instance;
 			   __entry->ctx = rq->fence.context;
 			   __entry->seqno = rq->fence.seqno;
-			   __entry->global_seqno = rq->global_seqno;
 			   __entry->prio = rq->sched.attr.priority;
 			   __entry->port = port;
 			   ),
 
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, prio=%u, global=%u, port=%u",
+	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, prio=%u, port=%u",
 		      __entry->dev, __entry->class, __entry->instance,
 		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->prio, __entry->global_seqno, __entry->port)
+		      __entry->prio, __entry->port)
 );
 
 TRACE_EVENT(i915_request_out,
@@ -705,7 +700,6 @@ TRACE_EVENT(i915_request_out,
 			     __field(u16, class)
 			     __field(u16, instance)
 			     __field(u32, seqno)
-			     __field(u32, global_seqno)
 			     __field(u32, completed)
 			    ),
 
@@ -716,14 +710,13 @@ TRACE_EVENT(i915_request_out,
 			   __entry->instance = rq->engine->instance;
 			   __entry->ctx = rq->fence.context;
 			   __entry->seqno = rq->fence.seqno;
-			   __entry->global_seqno = rq->global_seqno;
 			   __entry->completed = i915_request_completed(rq);
 			   ),
 
-		    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, global=%u, completed?=%u",
+		    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, completed?=%u",
 			      __entry->dev, __entry->class, __entry->instance,
 			      __entry->hw_id, __entry->ctx, __entry->seqno,
-			      __entry->global_seqno, __entry->completed)
+			      __entry->completed)
 );
 
 #else
@@ -766,7 +759,6 @@ TRACE_EVENT(i915_request_wait_begin,
 			     __field(u16, class)
 			     __field(u16, instance)
 			     __field(u32, seqno)
-			     __field(u32, global)
 			     __field(unsigned int, flags)
 			     ),
 
@@ -783,14 +775,13 @@ TRACE_EVENT(i915_request_wait_begin,
 			   __entry->instance = rq->engine->instance;
 			   __entry->ctx = rq->fence.context;
 			   __entry->seqno = rq->fence.seqno;
-			   __entry->global = rq->global_seqno;
 			   __entry->flags = flags;
 			   ),
 
-	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, global=%u, blocking=%u, flags=0x%x",
+	    TP_printk("dev=%u, engine=%u:%u, hw_id=%u, ctx=%llu, seqno=%u, blocking=%u, flags=0x%x",
 		      __entry->dev, __entry->class, __entry->instance,
 		      __entry->hw_id, __entry->ctx, __entry->seqno,
-		      __entry->global, !!(__entry->flags & I915_WAIT_LOCKED),
+		      !!(__entry->flags & I915_WAIT_LOCKED),
 		      __entry->flags)
 );
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index b7f129504014..47f3cad6e861 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -455,12 +455,6 @@ int intel_engines_init(struct drm_i915_private *dev_priv)
 	return err;
 }
 
-void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno)
-{
-	intel_write_status_page(engine, I915_GEM_HWS_INDEX, seqno);
-	GEM_BUG_ON(intel_engine_get_seqno(engine) != seqno);
-}
-
 static void intel_engine_init_batch_pool(struct intel_engine_cs *engine)
 {
 	i915_gem_batch_pool_init(&engine->batch_pool, engine);
@@ -588,29 +582,19 @@ int intel_engine_setup_common(struct intel_engine_cs *engine)
 {
 	int err;
 
+	spin_lock_init(&engine->execution_lock);
+	INIT_LIST_HEAD(&engine->requests);
+
 	err = init_status_page(engine);
 	if (err)
 		return err;
 
-	err = i915_timeline_init(engine->i915,
-				 &engine->timeline,
-				 engine->name,
-				 engine->status_page.vma);
-	if (err)
-		goto err_hwsp;
-
-	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
-
 	intel_engine_init_execlist(engine);
 	intel_engine_init_hangcheck(engine);
 	intel_engine_init_batch_pool(engine);
 	intel_engine_init_cmd_parser(engine);
 
 	return 0;
-
-err_hwsp:
-	cleanup_status_page(engine);
-	return err;
 }
 
 static void context_unpin(struct i915_gem_context *ctx,
@@ -694,8 +678,6 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine)
 		context_unpin(i915->preempt_context, engine);
 	context_unpin(i915->kernel_context, engine);
 
-	i915_timeline_fini(&engine->timeline);
-
 	intel_wa_list_free(&engine->ctx_wa_list);
 	intel_wa_list_free(&engine->wa_list);
 	intel_wa_list_free(&engine->whitelist);
@@ -954,10 +936,6 @@ bool intel_engine_is_idle(struct intel_engine_cs *engine)
 	if (i915_terminally_wedged(&dev_priv->gpu_error))
 		return true;
 
-	/* Any inflight/incomplete requests? */
-	if (!intel_engine_signaled(engine, intel_engine_last_submit(engine)))
-		return false;
-
 	/* Waiting to drain ELSP? */
 	if (READ_ONCE(engine->execlists.active)) {
 		struct tasklet_struct *t = &engine->execlists.tasklet;
@@ -1178,15 +1156,14 @@ static void print_request(struct drm_printer *m,
 
 	x = print_sched_attr(rq->i915, &rq->sched.attr, buf, x, sizeof(buf));
 
-	drm_printf(m, "%s%x%s%s [%llx:%llx]%s @ %dms: %s\n",
+	drm_printf(m, "%s %llx:%llx%s%s %s @ %dms: %s\n",
 		   prefix,
-		   rq->global_seqno,
+		   rq->fence.context, rq->fence.seqno,
 		   i915_request_completed(rq) ? "!" :
 		   i915_request_started(rq) ? "*" :
 		   "",
 		   test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
 			    &rq->fence.flags) ?  "+" : "",
-		   rq->fence.context, rq->fence.seqno,
 		   buf,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
 		   name);
@@ -1392,12 +1369,11 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 	if (i915_terminally_wedged(&engine->i915->gpu_error))
 		drm_printf(m, "*** WEDGED ***\n");
 
-	drm_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x/%x [%d ms]\n",
-		   intel_engine_get_seqno(engine),
-		   intel_engine_last_submit(engine),
+	drm_printf(m, "\tHangcheck %x/%x [%d ms]\n",
 		   engine->hangcheck.last_seqno,
 		   engine->hangcheck.next_seqno,
-		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
+		   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
 	drm_printf(m, "\tReset count: %d (global %d)\n",
 		   i915_reset_engine_count(error, engine),
 		   i915_reset_count(error));
@@ -1406,14 +1382,12 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 
 	drm_printf(m, "\tRequests:\n");
 
-	rq = list_first_entry(&engine->timeline.requests,
-			      struct i915_request, link);
-	if (&rq->link != &engine->timeline.requests)
+	rq = list_first_entry(&engine->requests, struct i915_request, link);
+	if (&rq->link != &engine->requests)
 		print_request(m, rq, "\t\tfirst  ");
 
-	rq = list_last_entry(&engine->timeline.requests,
-			     struct i915_request, link);
-	if (&rq->link != &engine->timeline.requests)
+	rq = list_last_entry(&engine->requests, struct i915_request, link);
+	if (&rq->link != &engine->requests)
 		print_request(m, rq, "\t\tlast   ");
 
 	rq = i915_gem_find_active_request(engine);
@@ -1494,7 +1468,7 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 	if (!intel_engine_supports_stats(engine))
 		return -ENODEV;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 	write_seqlock(&engine->stats.lock);
 
 	if (unlikely(engine->stats.enabled == ~0)) {
@@ -1520,7 +1494,7 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine)
 
 unlock:
 	write_sequnlock(&engine->stats.lock);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 
 	return err;
 }
diff --git a/drivers/gpu/drm/i915/intel_guc_submission.c b/drivers/gpu/drm/i915/intel_guc_submission.c
index b044162a41d3..8d62d7fd51e8 100644
--- a/drivers/gpu/drm/i915/intel_guc_submission.c
+++ b/drivers/gpu/drm/i915/intel_guc_submission.c
@@ -534,7 +534,7 @@ static void guc_add_request(struct intel_guc *guc, struct i915_request *rq)
 	spin_lock(&client->wq_lock);
 
 	guc_wq_item_append(client, engine->guc_id, ctx_desc,
-			   ring_tail, rq->global_seqno);
+			   ring_tail, rq->fence.seqno);
 	guc_ring_doorbell(client);
 
 	client->submissions[engine->id] += 1;
@@ -730,7 +730,7 @@ static bool __guc_dequeue(struct intel_engine_cs *engine)
 	bool submit = false;
 	struct rb_node *rb;
 
-	lockdep_assert_held(&engine->timeline.lock);
+	lockdep_assert_held(&engine->execution_lock);
 
 	if (port_isset(port)) {
 		if (intel_engine_has_preemption(engine)) {
@@ -811,7 +811,7 @@ static void guc_submission_tasklet(unsigned long data)
 	struct i915_request *rq;
 	unsigned long flags;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	rq = port_request(port);
 	while (rq && i915_request_completed(rq)) {
@@ -836,7 +836,7 @@ static void guc_submission_tasklet(unsigned long data)
 	if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_PREEMPT))
 		guc_dequeue(engine);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void guc_reset_prepare(struct intel_engine_cs *engine)
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 541e0e9ee781..323341e9bf2d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -172,11 +172,6 @@ static void execlists_init_reg_state(u32 *reg_state,
 				     struct intel_engine_cs *engine,
 				     struct intel_ring *ring);
 
-static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
-{
-	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_INDEX_ADDR;
-}
-
 static inline u32 intel_hws_hangcheck_address(struct intel_engine_cs *engine)
 {
 	return (i915_ggtt_offset(engine->status_page.vma) +
@@ -285,11 +280,9 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	struct list_head *uninitialized_var(pl);
 	int prio = I915_PRIORITY_INVALID | I915_PRIORITY_NEWCLIENT;
 
-	lockdep_assert_held(&engine->timeline.lock);
+	lockdep_assert_held(&engine->execution_lock);
 
-	list_for_each_entry_safe_reverse(rq, rn,
-					 &engine->timeline.requests,
-					 link) {
+	list_for_each_entry_safe_reverse(rq, rn, &engine->requests, link) {
 		if (i915_request_completed(rq))
 			break;
 
@@ -452,13 +445,11 @@ static void execlists_submit_ports(struct intel_engine_cs *engine)
 			desc = execlists_update_context(rq);
 			GEM_DEBUG_EXEC(port[n].context_id = upper_32_bits(desc));
 
-			GEM_TRACE("%s in[%d]:  ctx=%d.%d, global=%d (fence %llx:%lld) (current %d:%d), prio=%d\n",
+			GEM_TRACE("%s in[%d]:  ctx=%d.%d, fence=%llx:%lld, current=%d, prio=%d\n",
 				  engine->name, n,
 				  port[n].context_id, count,
-				  rq->global_seqno,
 				  rq->fence.context, rq->fence.seqno,
 				  hwsp_seqno(rq),
-				  intel_engine_get_seqno(engine),
 				  rq_prio(rq));
 		} else {
 			GEM_BUG_ON(!n);
@@ -754,13 +745,11 @@ execlists_cancel_port_requests(struct intel_engine_execlists * const execlists)
 	while (num_ports-- && port_isset(port)) {
 		struct i915_request *rq = port_request(port);
 
-		GEM_TRACE("%s:port%u global=%d (fence %llx:%lld), (current %d:%d)\n",
+		GEM_TRACE("%s:port%u fence=%llx:%lld current=%d\n",
 			  rq->engine->name,
 			  (unsigned int)(port - execlists->port),
-			  rq->global_seqno,
 			  rq->fence.context, rq->fence.seqno,
-			  hwsp_seqno(rq),
-			  intel_engine_get_seqno(rq->engine));
+			  hwsp_seqno(rq));
 
 		GEM_BUG_ON(!execlists->active);
 		execlists_context_schedule_out(rq,
@@ -816,9 +805,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	struct rb_node *rb;
 	unsigned long flags;
 
-	GEM_TRACE("%s current %d\n",
-		  engine->name, intel_engine_get_seqno(engine));
-
 	/*
 	 * Before we call engine->cancel_requests(), we should have exclusive
 	 * access to the submission state. This is arranged for us by the
@@ -833,19 +819,16 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	 * submission's irq state, we also wish to remind ourselves that
 	 * it is irq state.)
 	 */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	/* Cancel the requests on the HW and clear the ELSP tracker. */
 	execlists_cancel_port_requests(execlists);
 	execlists_user_end(execlists);
 
 	/* Mark all executing requests as skipped. */
-	list_for_each_entry(rq, &engine->timeline.requests, link) {
-		GEM_BUG_ON(!rq->global_seqno);
-
+	list_for_each_entry(rq, &engine->requests, link) {
 		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags))
 			dma_fence_set_error(&rq->fence, -EIO);
-
 		i915_request_fake_complete(rq);
 	}
 
@@ -867,10 +850,6 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 			kmem_cache_free(engine->i915->priorities, p);
 	}
 
-	intel_write_status_page(engine,
-				I915_GEM_HWS_INDEX,
-				intel_engine_last_submit(engine));
-
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
 	execlists->queue_priority = INT_MIN;
@@ -880,7 +859,7 @@ static void execlists_cancel_requests(struct intel_engine_cs *engine)
 	GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
 	execlists->tasklet.func = nop_submission_tasklet;
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static inline bool
@@ -984,14 +963,12 @@ static void process_csb(struct intel_engine_cs *engine)
 						EXECLISTS_ACTIVE_USER));
 
 		rq = port_unpack(port, &count);
-		GEM_TRACE("%s out[0]: ctx=%d.%d, global=%d (fence %llx:%lld) (current %d:%d), prio=%d\n",
+		GEM_TRACE("%s out[0]: ctx=%d.%d, fence=%llx:%lld current=%d, prio=%d\n",
 			  engine->name,
 			  port->context_id, count,
-			  rq ? rq->global_seqno : 0,
 			  rq ? rq->fence.context : 0,
 			  rq ? rq->fence.seqno : 0,
 			  rq ? hwsp_seqno(rq) : 0,
-			  intel_engine_get_seqno(engine),
 			  rq ? rq_prio(rq) : 0);
 
 		/* Check the context/desc id for this event matches */
@@ -1056,7 +1033,7 @@ static void process_csb(struct intel_engine_cs *engine)
 
 static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 {
-	lockdep_assert_held(&engine->timeline.lock);
+	lockdep_assert_held(&engine->execution_lock);
 
 	process_csb(engine);
 	if (!execlists_is_active(&engine->execlists, EXECLISTS_ACTIVE_PREEMPT))
@@ -1077,9 +1054,9 @@ static void execlists_submission_tasklet(unsigned long data)
 		  !!engine->i915->gt.awake,
 		  engine->execlists.active);
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 	__execlists_submission_tasklet(engine);
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void queue_request(struct intel_engine_cs *engine,
@@ -1118,7 +1095,7 @@ static void execlists_submit_request(struct i915_request *request)
 	unsigned long flags;
 
 	/* Will be called from irq-context when using foreign fences. */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	queue_request(engine, &request->sched, rq_prio(request));
 
@@ -1127,7 +1104,7 @@ static void execlists_submit_request(struct i915_request *request)
 
 	submit_queue(engine, request);
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void execlists_context_destroy(struct intel_context *ce)
@@ -1160,9 +1137,9 @@ static void execlists_context_unpin(struct intel_context *ce)
 	if (unlikely(engine)) {
 		unsigned long flags;
 
-		spin_lock_irqsave(&engine->timeline.lock, flags);
+		spin_lock_irqsave(&engine->execution_lock, flags);
 		process_csb(engine);
-		spin_unlock_irqrestore(&engine->timeline.lock, flags);
+		spin_unlock_irqrestore(&engine->execution_lock, flags);
 
 		GEM_BUG_ON(READ_ONCE(ce->active));
 	}
@@ -1787,9 +1764,9 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine)
 	GEM_BUG_ON(!reset_in_progress(execlists));
 
 	/* And flush any current direct submission. */
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 	process_csb(engine); /* drain preemption events */
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
@@ -1799,7 +1776,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	unsigned long flags;
 	u32 *regs;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	/*
 	 * Catch up with any missed context-switch interrupts.
@@ -1818,12 +1795,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	/* Following the reset, we need to reload the CSB read/write pointers */
 	reset_csb_pointers(&engine->execlists);
 
-	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
-		  engine->name,
-		  rq ? rq->global_seqno : 0,
-		  intel_engine_get_seqno(engine),
-		  yesno(stalled));
-	if (!rq)
+	if (!rq || !i915_request_started(rq))
 		goto out_unlock;
 
 	/*
@@ -1865,7 +1837,7 @@ static void execlists_reset(struct intel_engine_cs *engine, bool stalled)
 	intel_ring_update_space(rq->ring);
 
 out_unlock:
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void execlists_reset_finish(struct intel_engine_cs *engine)
@@ -2061,9 +2033,6 @@ static void gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
 
 static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 {
-	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
-	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
-
 	cs = gen8_emit_ggtt_write(cs,
 				  request->fence.seqno,
 				  i915_timeline_seqno_address(request->timeline));
@@ -2072,10 +2041,6 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 				  intel_engine_next_hangcheck_seqno(request->engine),
 				  intel_hws_hangcheck_address(request->engine));
 
-	cs = gen8_emit_ggtt_write(cs,
-				  request->global_seqno,
-				  intel_hws_seqno_address(request->engine));
-
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 
@@ -2084,7 +2049,7 @@ static void gen8_emit_breadcrumb(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_sz = 14 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_sz = 10 + WA_TAIL_DWORDS;
 
 static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 {
@@ -2102,11 +2067,6 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 				      intel_hws_hangcheck_address(request->engine),
 				      PIPE_CONTROL_CS_STALL);
 
-	cs = gen8_emit_ggtt_write_rcs(cs,
-				      request->global_seqno,
-				      intel_hws_seqno_address(request->engine),
-				      PIPE_CONTROL_CS_STALL);
-
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
 
@@ -2115,7 +2075,7 @@ static void gen8_emit_breadcrumb_rcs(struct i915_request *request, u32 *cs)
 
 	gen8_emit_wa_tail(request, cs);
 }
-static const int gen8_emit_breadcrumb_rcs_sz = 20 + WA_TAIL_DWORDS;
+static const int gen8_emit_breadcrumb_rcs_sz = 14 + WA_TAIL_DWORDS;
 
 static int gen8_init_rcs_context(struct i915_request *rq)
 {
@@ -2765,11 +2725,11 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 	unsigned int count;
 	struct rb_node *rb;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	last = NULL;
 	count = 0;
-	list_for_each_entry(rq, &engine->timeline.requests, link) {
+	list_for_each_entry(rq, &engine->requests, link) {
 		if (count++ < max - 1)
 			show_request(m, rq, "\t\tE ");
 		else
@@ -2809,7 +2769,7 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 		show_request(m, last, "\t\tQ ");
 	}
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 0bc513c1db33..e2c415aa8354 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -43,11 +43,6 @@
  */
 #define LEGACY_REQUEST_SIZE 200
 
-static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
-{
-	return i915_ggtt_offset(engine->status_page.vma) + I915_GEM_HWS_INDEX_ADDR;
-}
-
 static unsigned int __intel_ring_space(unsigned int head,
 				       unsigned int tail,
 				       unsigned int size)
@@ -329,18 +324,13 @@ static void gen6_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 		PIPE_CONTROL_GLOBAL_GTT;
 	*cs++ = rq->fence.seqno;
 
-	*cs++ = GFX_OP_PIPE_CONTROL(4);
-	*cs++ = PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL;
-	*cs++ = intel_hws_seqno_address(rq->engine) | PIPE_CONTROL_GLOBAL_GTT;
-	*cs++ = rq->global_seqno;
-
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen6_rcs_emit_breadcrumb_sz = 18;
+static const int gen6_rcs_emit_breadcrumb_sz = 14;
 
 static int
 gen7_render_ring_cs_stall_wa(struct i915_request *rq)
@@ -434,13 +424,6 @@ static void gen7_rcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = i915_timeline_seqno_address(rq->timeline);
 	*cs++ = rq->fence.seqno;
 
-	*cs++ = GFX_OP_PIPE_CONTROL(4);
-	*cs++ = (PIPE_CONTROL_QW_WRITE |
-		 PIPE_CONTROL_GLOBAL_GTT_IVB |
-		 PIPE_CONTROL_CS_STALL);
-	*cs++ = intel_hws_seqno_address(rq->engine);
-	*cs++ = rq->global_seqno;
-
 	*cs++ = MI_USER_INTERRUPT;
 	*cs++ = MI_NOOP;
 
@@ -459,16 +442,13 @@ static void gen6_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
 
-	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
-	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
-	*cs++ = rq->global_seqno;
-
 	*cs++ = MI_USER_INTERRUPT;
+	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen6_xcs_emit_breadcrumb_sz = 10;
+static const int gen6_xcs_emit_breadcrumb_sz = 8;
 
 #define GEN7_XCS_WA 32
 static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
@@ -483,10 +463,6 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR | MI_FLUSH_DW_USE_GTT;
 	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
 
-	*cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX;
-	*cs++ = I915_GEM_HWS_INDEX_ADDR | MI_FLUSH_DW_USE_GTT;
-	*cs++ = rq->global_seqno;
-
 	for (i = 0; i < GEN7_XCS_WA; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
 		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
@@ -498,12 +474,11 @@ static void gen7_xcs_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = 0;
 
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int gen7_xcs_emit_breadcrumb_sz = 14 + GEN7_XCS_WA * 3;
+static const int gen7_xcs_emit_breadcrumb_sz = 10 + GEN7_XCS_WA * 3;
 #undef GEN7_XCS_WA
 
 static void set_hwstam(struct intel_engine_cs *engine, u32 mask)
@@ -755,25 +730,19 @@ static void reset_prepare(struct intel_engine_cs *engine)
 
 static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 {
-	struct i915_timeline *tl = &engine->timeline;
 	struct i915_request *pos, *rq;
 	unsigned long flags;
 	u32 head;
 
 	rq = NULL;
-	spin_lock_irqsave(&tl->lock, flags);
-	list_for_each_entry(pos, &tl->requests, link) {
+	spin_lock_irqsave(&engine->execution_lock, flags);
+	list_for_each_entry(pos, &engine->requests, link) {
 		if (!i915_request_completed(pos)) {
 			rq = pos;
 			break;
 		}
 	}
 
-	GEM_TRACE("%s seqno=%d, current=%d, stalled? %s\n",
-		  engine->name,
-		  rq ? rq->global_seqno : 0,
-		  intel_engine_get_seqno(engine),
-		  yesno(stalled));
 	/*
 	 * The guilty request will get skipped on a hung engine.
 	 *
@@ -821,7 +790,7 @@ static void reset_ring(struct intel_engine_cs *engine, bool stalled)
 	}
 	engine->buffer->head = intel_ring_wrap(engine->buffer, head);
 
-	spin_unlock_irqrestore(&tl->lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void reset_finish(struct intel_engine_cs *engine)
@@ -899,25 +868,19 @@ static void cancel_requests(struct intel_engine_cs *engine)
 	struct i915_request *request;
 	unsigned long flags;
 
-	spin_lock_irqsave(&engine->timeline.lock, flags);
+	spin_lock_irqsave(&engine->execution_lock, flags);
 
 	/* Mark all submitted requests as skipped. */
-	list_for_each_entry(request, &engine->timeline.requests, link) {
-		GEM_BUG_ON(!request->global_seqno);
-
+	list_for_each_entry(request, &engine->requests, link) {
 		if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
 			      &request->fence.flags))
 			dma_fence_set_error(&request->fence, -EIO);
 		i915_request_fake_complete(request);
 	}
 
-	intel_write_status_page(engine,
-				I915_GEM_HWS_INDEX,
-				intel_engine_last_submit(engine));
-
 	/* Remaining _unready_ requests will be nop'ed when submitted */
 
-	spin_unlock_irqrestore(&engine->timeline.lock, flags);
+	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
 static void i9xx_submit_request(struct i915_request *request)
@@ -944,19 +907,14 @@ static void i9xx_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
 	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
 
-	*cs++ = MI_STORE_DWORD_INDEX;
-	*cs++ = I915_GEM_HWS_INDEX_ADDR;
-	*cs++ = rq->global_seqno;
-
 	*cs++ = MI_USER_INTERRUPT;
-	*cs++ = MI_NOOP;
 
 	rq->tail = intel_ring_offset(rq, cs);
 	assert_ring_tail_valid(rq->ring, rq->tail);
 }
-static const int i9xx_emit_breadcrumb_sz = 12;
+static const int i9xx_emit_breadcrumb_sz = 8;
 
-#define GEN5_WA_STORES 8 /* must be at least 1! */
+#define GEN5_WA_STORES 8
 static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 {
 	int i;
@@ -973,11 +931,10 @@ static void gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
 	*cs++ = I915_GEM_HWS_HANGCHECK_ADDR;
 	*cs++ = intel_engine_next_hangcheck_seqno(rq->engine);
 
-	BUILD_BUG_ON(GEN5_WA_STORES < 1);
 	for (i = 0; i < GEN5_WA_STORES; i++) {
 		*cs++ = MI_STORE_DWORD_INDEX;
-		*cs++ = I915_GEM_HWS_INDEX_ADDR;
-		*cs++ = rq->global_seqno;
+		*cs++ = I915_GEM_HWS_SCRATCH_ADDR;
+		*cs++ = 0;
 	}
 
 	*cs++ = MI_USER_INTERRUPT;
@@ -1331,7 +1288,6 @@ intel_engine_create_ring(struct intel_engine_cs *engine,
 
 	GEM_BUG_ON(!is_power_of_2(size));
 	GEM_BUG_ON(RING_CTL_SIZE(size) & ~RING_NR_PAGES);
-	GEM_BUG_ON(timeline == &engine->timeline);
 	lockdep_assert_held(&engine->i915->drm.struct_mutex);
 
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 76e06ca0490b..9ca8f5ff5dd4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -365,7 +365,10 @@ struct intel_engine_cs {
 
 	struct intel_ring *buffer;
 
-	struct i915_timeline timeline;
+	struct list_head requests;
+	spinlock_t execution_lock;
+#define EXECUTION_HW 0 /* default */
+#define EXECUTION_VIRTUAL 1
 
 	struct drm_i915_gem_object *default_state;
 	void *pinned_default_state;
@@ -700,9 +703,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
  *
  * The area from dword 0x30 to 0x3ff is available for driver usage.
  */
-#define I915_GEM_HWS_INDEX		0x30
-#define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
-#define I915_GEM_HWS_PREEMPT_INDEX	0x32
+#define I915_GEM_HWS_PREEMPT_INDEX	0x30
 #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
 #define I915_GEM_HWS_HANGCHECK		0x34
 #define I915_GEM_HWS_HANGCHECK_ADDR (I915_GEM_HWS_HANGCHECK << MI_STORE_DWORD_INDEX_SHIFT)
@@ -812,8 +813,6 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
 	return tail;
 }
 
-void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
-
 int intel_engine_setup_common(struct intel_engine_cs *engine);
 int intel_engine_init_common(struct intel_engine_cs *engine);
 void intel_engine_cleanup_common(struct intel_engine_cs *engine);
@@ -831,44 +830,6 @@ void intel_engine_set_hwsp_writemask(struct intel_engine_cs *engine, u32 mask);
 u64 intel_engine_get_active_head(const struct intel_engine_cs *engine);
 u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine);
 
-static inline u32 intel_engine_last_submit(struct intel_engine_cs *engine)
-{
-	/*
-	 * We are only peeking at the tail of the submit queue (and not the
-	 * queue itself) in order to gain a hint as to the current active
-	 * state of the engine. Callers are not expected to be taking
-	 * engine->timeline->lock, nor are they expected to be concerned
-	 * wtih serialising this hint with anything, so document it as
-	 * a hint and nothing more.
-	 */
-	return READ_ONCE(engine->timeline.seqno);
-}
-
-static inline u32 intel_engine_get_seqno(struct intel_engine_cs *engine)
-{
-	return intel_read_status_page(engine, I915_GEM_HWS_INDEX);
-}
-
-static inline bool intel_engine_signaled(struct intel_engine_cs *engine,
-					 u32 seqno)
-{
-	return i915_seqno_passed(intel_engine_get_seqno(engine), seqno);
-}
-
-static inline bool intel_engine_has_completed(struct intel_engine_cs *engine,
-					      u32 seqno)
-{
-	GEM_BUG_ON(!seqno);
-	return intel_engine_signaled(engine, seqno);
-}
-
-static inline bool intel_engine_has_started(struct intel_engine_cs *engine,
-					    u32 seqno)
-{
-	GEM_BUG_ON(!seqno);
-	return intel_engine_signaled(engine, seqno - 1);
-}
-
 void intel_engine_get_instdone(struct intel_engine_cs *engine,
 			       struct intel_instdone *instdone);
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index 5953a47827a7..fa079c6d9c65 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -225,8 +225,7 @@ static int igt_request_rewind(void *arg)
 	mutex_unlock(&i915->drm.struct_mutex);
 
 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
-		pr_err("timed out waiting for high priority request, vip.seqno=%d, current seqno=%d\n",
-		       vip->global_seqno, intel_engine_get_seqno(i915->engine[RCS]));
+		pr_err("timed out waiting for high priority request\n");
 		goto err;
 	}
 
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 04d66b4303ab..9fe9ba66b5ec 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -565,11 +565,10 @@ static int active_request_put(struct i915_request *rq)
 		return 0;
 
 	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
-		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld, seqno %d.\n",
+		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld.\n",
 			  rq->engine->name,
 			  rq->fence.context,
-			  rq->fence.seqno,
-			  i915_request_global_seqno(rq));
+			  rq->fence.seqno);
 		GEM_TRACE_DUMP();
 
 		i915_gem_set_wedged(rq->i915);
diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
index 0c5649044bc9..9f76ae27a6c2 100644
--- a/drivers/gpu/drm/i915/selftests/mock_engine.c
+++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
@@ -79,8 +79,6 @@ static void advance(struct mock_request *request)
 {
 	list_del_init(&request->link);
 	i915_request_fake_complete(&request->base);
-	intel_engine_write_global_seqno(request->base.engine,
-					request->base.global_seqno);
 	intel_engine_queue_breadcrumbs(request->base.engine);
 }
 
@@ -184,7 +182,6 @@ static void mock_submit_request(struct i915_request *request)
 	unsigned long flags;
 
 	i915_request_submit(request);
-	GEM_BUG_ON(!request->global_seqno);
 
 	spin_lock_irqsave(&engine->hw_lock, flags);
 	list_add_tail(&mock->link, &engine->hw_queue);
@@ -221,12 +218,8 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
 	engine->base.submit_request = mock_submit_request;
 
-	if (i915_timeline_init(i915,
-			       &engine->base.timeline,
-			       engine->base.name,
-			       NULL))
-		goto err_free;
-	i915_timeline_set_subclass(&engine->base.timeline, TIMELINE_ENGINE);
+	spin_lock_init(&engine->base.execution_lock);
+	INIT_LIST_HEAD(&engine->base.requests);
 
 	engine->base.execlists.queue_priority = INT_MIN;
 	engine->base.execlists.queue = RB_ROOT_CACHED;
@@ -246,8 +239,6 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
 
 err_breadcrumbs:
 	intel_engine_fini_breadcrumbs(&engine->base);
-	i915_timeline_fini(&engine->base.timeline);
-err_free:
 	kfree(engine);
 	return NULL;
 }
@@ -268,7 +259,6 @@ void mock_engine_flush(struct intel_engine_cs *engine)
 
 void mock_engine_reset(struct intel_engine_cs *engine)
 {
-	intel_engine_write_global_seqno(engine, 0);
 }
 
 void mock_engine_free(struct intel_engine_cs *engine)
@@ -281,7 +271,6 @@ void mock_engine_free(struct intel_engine_cs *engine)
 	context_unpin(engine->i915->kernel_context, engine);
 
 	intel_engine_fini_breadcrumbs(engine);
-	i915_timeline_fini(&engine->timeline);
 
 	kfree(engine);
 }
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (25 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 26/38] drm/i915: Remove the global per-engine execution timeline Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-22  9:31   ` Tvrtko Ursulin
  2019-01-18 14:00 ` [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts Chris Wilson
                   ` (12 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

An idea for extending uABI inspired by Vulkan's extension chains.
Instead of expanding the data struct for each ioctl every time we need
to add a new feature, define an extension chain. As we add
optional interfaces to control the ioctl, we define a new extension
struct that can be linked into the ioctl data only when required by the
user. The key advantage is being able to ignore large control structs for
optional interfaces/extensions, while being able to process them in a
consistent manner.

In comparison to other extensible ioctls, the key difference is the
use of a linked chain of extension structs vs an array of tagged
pointers. For example,

struct drm_amdgpu_cs_chunk {
        __u32           chunk_id;
        __u32           length_dw;
        __u64           chunk_data;
};

struct drm_amdgpu_cs_in {
        __u32           ctx_id;
        __u32           bo_list_handle;
        __u32           num_chunks;
        __u32           _pad;
        __u64           chunks;
};

allows userspace to pass in an array of pointers to extension structs, but
it must therefore keep constructing that array alongside the command stream.
In dynamic situations like that, a linked list is preferred and does not
suffer from extra cache line misses, as the extension structs themselves
must still be loaded separately from the chunks array.
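
For comparison, building an equivalent i915 chain in userspace might look
like the sketch below. The FOO/BAR extension structs and their name values
are placeholders invented for illustration; only struct i915_user_extension
itself is defined by this patch.

struct foo_ext {
        struct i915_user_extension base;        /* base.name = FOO_EXT */
        __u32 flags;
        __u32 pad;
};

struct bar_ext {
        struct i915_user_extension base;        /* base.name = BAR_EXT */
        __u64 value;
};

struct bar_ext bar = {
        .base.name = BAR_EXT,   /* tail of the chain: next_extension stays 0 */
        .value = 42,
};

struct foo_ext foo = {
        .base.name = FOO_EXT,
        .base.next_extension = (uintptr_t)&bar, /* link to the next node */
        .flags = 1,
};

/* the ioctl payload then carries only a pointer to the head of the chain,
 * e.g. args.extensions = (uintptr_t)&foo;
 */

The kernel walks that chain with i915_user_extensions(), dispatching each
node by its name through the per-ioctl table, so new optional interfaces can
be added without ever growing the base ioctl struct.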

v2: Apply the tail call optimisation directly to nip the worry of stack
overflow in the bud.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/Makefile               |  1 +
 drivers/gpu/drm/i915/i915_user_extensions.c | 42 +++++++++++++++++++++
 drivers/gpu/drm/i915/i915_user_extensions.h | 20 ++++++++++
 include/uapi/drm/i915_drm.h                 | 20 ++++++++++
 4 files changed, 83 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/i915_user_extensions.c
 create mode 100644 drivers/gpu/drm/i915/i915_user_extensions.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index 611115ed00db..f206fbc85cee 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -45,6 +45,7 @@ i915-y := i915_drv.o \
 	  i915_sw_fence.o \
 	  i915_syncmap.o \
 	  i915_sysfs.o \
+	  i915_user_extensions.o \
 	  intel_csr.o \
 	  intel_device_info.o \
 	  intel_pm.o \
diff --git a/drivers/gpu/drm/i915/i915_user_extensions.c b/drivers/gpu/drm/i915/i915_user_extensions.c
new file mode 100644
index 000000000000..5d90c368f185
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_user_extensions.c
@@ -0,0 +1,42 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include <linux/sched/signal.h>
+#include <linux/uaccess.h>
+#include <uapi/drm/i915_drm.h>
+
+#include "i915_user_extensions.h"
+
+int i915_user_extensions(struct i915_user_extension __user *ext,
+			 const i915_user_extension_fn *tbl,
+			 unsigned long count,
+			 void *data)
+{
+	while (ext) {
+		int err;
+		u64 x;
+
+		cond_resched();
+		if (signal_pending(current))
+			return -EINTR;
+
+		if (get_user(x, &ext->name))
+			return -EFAULT;
+
+		err = -EINVAL;
+		if (x < count && tbl[x])
+			err = tbl[x](ext, data);
+		if (err)
+			return err;
+
+		if (get_user(x, &ext->next_extension))
+			return -EFAULT;
+
+		ext = u64_to_user_ptr(x);
+	}
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/i915/i915_user_extensions.h b/drivers/gpu/drm/i915/i915_user_extensions.h
new file mode 100644
index 000000000000..313a510b068a
--- /dev/null
+++ b/drivers/gpu/drm/i915/i915_user_extensions.h
@@ -0,0 +1,20 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#ifndef I915_USER_EXTENSIONS_H
+#define I915_USER_EXTENSIONS_H
+
+struct i915_user_extension;
+
+typedef int (*i915_user_extension_fn)(struct i915_user_extension __user *ext,
+				      void *data);
+
+int i915_user_extensions(struct i915_user_extension __user *ext,
+			 const i915_user_extension_fn *tbl,
+			 unsigned long count,
+			 void *data);
+
+#endif /* I915_USER_EXTENSIONS_H */
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 298b2e197744..6ee2221838da 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -62,6 +62,26 @@ extern "C" {
 #define I915_ERROR_UEVENT		"ERROR"
 #define I915_RESET_UEVENT		"RESET"
 
+/*
+ * i915_user_extension: Base class for defining a chain of extensions
+ *
+ * Many interfaces need to grow over time. In most cases we can simply
+ * extend the struct and have userspace pass in more data. Another option,
+ * as demonstrated by Vulkan's approach to providing extensions for forward
+ * and backward compatibility, is to use a list of optional structs to
+ * provide those extra details.
+ *
+ * The key advantage to using an extension chain is that it allows us to
+ * redefine the interface more easily than an ever growing struct of
+ * increasing complexity, and for large parts of that interface to be
+ * entirely optional. The downside is more pointer chasing; chasing across
+ * the __user boundary with pointers encapsulated inside u64.
+ */
+struct i915_user_extension {
+	__u64 next_extension;
+	__u64 name;
+};
+
 /*
  * MOCS indexes used for GPU surfaces, defining the cacheability of the
  * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (26 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method Chris Wilson
@ 2019-01-18 14:00 ` Chris Wilson
  2019-01-23 11:30   ` Tvrtko Ursulin
  2019-01-18 14:01 ` [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context Chris Wilson
                   ` (11 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:00 UTC (permalink / raw)
  To: intel-gfx

In preparation for making the ppGTT binding for a context explicit (to
facilitate reusing the same ppGTT between different contexts), allow the
user to create and destroy named ppGTTs.
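
A rough sketch of the intended userspace usage, assuming an open render-node
fd and with error handling trimmed (the struct and ioctl numbers are as
defined by this patch):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: create an extra ppGTT, return its id, destroy it when done. */
static int vm_create(int fd, uint32_t *id)
{
        struct drm_i915_gem_vm_create create = { .extensions = 0, .flags = 0 };
        int err;

        err = ioctl(fd, DRM_IOCTL_I915_GEM_VM_CREATE, &create);
        if (err)
                return err;

        *id = create.id; /* valid until VM_DESTROY or the fd is closed */
        return 0;
}

static int vm_destroy(int fd, uint32_t id)
{
        /* the destroy ioctl takes just the u32 vm id */
        return ioctl(fd, DRM_IOCTL_I915_GEM_VM_DESTROY, &id);
}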

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |  2 +
 drivers/gpu/drm/i915/i915_drv.h         |  3 +
 drivers/gpu/drm/i915/i915_gem_context.c | 83 +++++++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h |  5 ++
 drivers/gpu/drm/i915/i915_gem_gtt.h     |  2 +
 include/uapi/drm/i915_drm.h             | 10 +++
 6 files changed, 105 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index f462a4d28af4..56bd087f1c4e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -3003,6 +3003,8 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_PERF_ADD_CONFIG, i915_perf_add_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_PERF_REMOVE_CONFIG, i915_perf_remove_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_QUERY, i915_query_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_CREATE, i915_gem_vm_create_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_DESTROY, i915_gem_vm_destroy_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 74bccb153359..cb11c23823c7 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -217,6 +217,9 @@ struct drm_i915_file_private {
 	} mm;
 	struct idr context_idr;
 
+	struct mutex vm_lock;
+	struct idr vm_idr;
+
 	struct intel_rps_client {
 		atomic_t boosts;
 	} rps_client;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index e8334c4bc130..7c90981704bf 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -598,19 +598,29 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	return 0;
 }
 
+static int vm_idr_cleanup(int id, void *p, void *data)
+{
+	i915_ppgtt_put(p);
+	return 0;
+}
+
 int i915_gem_context_open(struct drm_i915_private *i915,
 			  struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_gem_context *ctx;
 
+	mutex_init(&file_priv->vm_lock);
+
 	idr_init(&file_priv->context_idr);
+	idr_init_base(&file_priv->vm_idr, 1);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	ctx = i915_gem_create_context(i915, file_priv);
 	mutex_unlock(&i915->drm.struct_mutex);
 	if (IS_ERR(ctx)) {
 		idr_destroy(&file_priv->context_idr);
+		idr_destroy(&file_priv->vm_idr);
 		return PTR_ERR(ctx);
 	}
 
@@ -627,6 +637,79 @@ void i915_gem_context_close(struct drm_file *file)
 
 	idr_for_each(&file_priv->context_idr, context_idr_cleanup, NULL);
 	idr_destroy(&file_priv->context_idr);
+
+	idr_for_each(&file_priv->vm_idr, vm_idr_cleanup, NULL);
+	idr_destroy(&file_priv->vm_idr);
+
+	mutex_destroy(&file_priv->vm_lock);
+}
+
+int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file)
+{
+	struct drm_i915_private *i915 = to_i915(dev);
+	struct drm_i915_gem_vm_create *args = data;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int err;
+
+	if (!HAS_FULL_PPGTT(i915))
+		return -ENODEV;
+
+	if (args->flags)
+		return -EINVAL;
+
+	if (args->extensions)
+		return -EINVAL;
+
+	ppgtt = i915_ppgtt_create(i915, file_priv);
+	if (IS_ERR(ppgtt))
+		return PTR_ERR(ppgtt);
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		goto err_put;
+
+	err = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
+	mutex_unlock(&file_priv->vm_lock);
+	if (err < 0)
+		goto err_put;
+
+	ppgtt->user_handle = err;
+	args->id = err;
+	return 0;
+
+err_put:
+	i915_ppgtt_put(ppgtt);
+	return err;
+}
+
+int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file)
+{
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int err;
+	u32 id;
+
+	id = *(u32 *)data;
+	if (!id)
+		return -ENOENT;
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		return err;
+
+	ppgtt = idr_remove(&file_priv->vm_idr, id);
+	if (ppgtt)
+		ppgtt->user_handle = 0;
+
+	mutex_unlock(&file_priv->vm_lock);
+	if (!ppgtt)
+		return -ENOENT;
+
+	i915_ppgtt_put(ppgtt);
+	return 0;
 }
 
 static struct i915_request *
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 1e41a97b8007..9786f86b659d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -348,6 +348,11 @@ void i915_gem_context_release(struct kref *ctx_ref);
 struct i915_gem_context *
 i915_gem_context_create_gvt(struct drm_device *dev);
 
+int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file);
+int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file);
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 03ade71b8d9a..61698609a62c 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -396,6 +396,8 @@ struct i915_hw_ppgtt {
 		struct i915_page_directory_pointer pdp;	/* GEN8+ */
 		struct i915_page_directory pd;		/* GEN6-7 */
 	};
+
+	u32 user_handle;
 };
 
 struct gen6_hw_ppgtt {
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 6ee2221838da..c3336c931a95 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -339,6 +339,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_PERF_ADD_CONFIG	0x37
 #define DRM_I915_PERF_REMOVE_CONFIG	0x38
 #define DRM_I915_QUERY			0x39
+#define DRM_I915_GEM_VM_CREATE		0x3a
+#define DRM_I915_GEM_VM_DESTROY		0x3b
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -397,6 +399,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_PERF_ADD_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
 #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
 #define DRM_IOCTL_I915_QUERY			DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
+#define DRM_IOCTL_I915_GEM_VM_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_create)
+#define DRM_IOCTL_I915_GEM_VM_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, uint32_t)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -1442,6 +1446,12 @@ struct drm_i915_gem_context_destroy {
 	__u32 pad;
 };
 
+struct drm_i915_gem_vm_create {
+	__u64 extensions;
+	__u32 flags;
+	__u32 id; /* output: id of new vm */
+};
+
 struct drm_i915_reg_read {
 	/*
 	 * Register offset.
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (27 preceding siblings ...)
  2019-01-18 14:00 ` [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-23 12:00   ` Tvrtko Ursulin
  2019-01-18 14:01 ` [PATCH 30/38] drm/i915: Extend CONTEXT_CREATE to set parameters upon construction Chris Wilson
                   ` (10 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

Allow the user to share ppGTT between contexts on the same fd. This
gives the user the ability to relax their context isolation and share a vm
between their own contexts, but does not allow them to import a vm from
another fd. The use case for sharing a vm is for the proposed virtual
engine work where a context may be created explicitly to setup a load
balancing engine, but always be run in conjunction with a second context
for rcs/compute etc. Giving the user control over how they set up the
vm allows them to share a single vm between all the kernel contexts being
used to emulate a single client context, similarly to how we use a
single vm across all engines within a single kernel context today. It
also allows for the future specification of a separate vm between engines
inside a single kernel context, should that be desired.
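
On top of the previous patch, the expected flow for sharing a vm between two
contexts on the same fd is roughly the sketch below: GETPARAM exports the
first context's ppGTT as a vm id, and SETPARAM attaches that id to the
second context (error handling trimmed).

#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: make ctx_b run in the same ppGTT as ctx_a. */
static int share_vm(int fd, __u32 ctx_a, __u32 ctx_b)
{
        struct drm_i915_gem_context_param p = {
                .ctx_id = ctx_a,
                .param = I915_CONTEXT_PARAM_VM,
        };
        int err;

        /* allocate (or reuse) a vm id for ctx_a's ppgtt, returned in p.value */
        err = ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);
        if (err)
                return err;

        /* point ctx_b at the same vm; its old ppgtt is flushed and released */
        p.ctx_id = ctx_b;
        return ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}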

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_context.c       | 118 ++++++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  17 +-
 drivers/gpu/drm/i915/i915_gem_gtt.h           |  14 +-
 drivers/gpu/drm/i915/selftests/huge_pages.c   |   1 -
 .../gpu/drm/i915/selftests/i915_gem_context.c | 247 ++++++++++++++----
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   1 -
 drivers/gpu/drm/i915/selftests/mock_context.c |   8 +-
 include/uapi/drm/i915_drm.h                   |   1 +
 8 files changed, 339 insertions(+), 68 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 7c90981704bf..f707241dbc78 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -109,6 +109,8 @@ static void lut_close(struct i915_gem_context *ctx)
 		struct i915_vma *vma = rcu_dereference_raw(*slot);
 
 		radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
+
+		vma->open_count--;
 		__i915_gem_object_release_unless_active(vma->obj);
 	}
 	rcu_read_unlock();
@@ -291,7 +293,7 @@ static void context_close(struct i915_gem_context *ctx)
 	 */
 	lut_close(ctx);
 	if (ctx->ppgtt)
-		i915_ppgtt_close(&ctx->ppgtt->vm);
+		i915_ppgtt_close(ctx->ppgtt);
 
 	ctx->file_priv = ERR_PTR(-EBADF);
 	i915_gem_context_put(ctx);
@@ -401,6 +403,23 @@ static void __destroy_hw_context(struct i915_gem_context *ctx,
 	context_close(ctx);
 }
 
+static void __set_ppgtt(struct i915_gem_context *ctx,
+			struct i915_hw_ppgtt *ppgtt)
+{
+	if (ppgtt == ctx->ppgtt)
+		return;
+
+	if (ctx->ppgtt) {
+		i915_ppgtt_close(ctx->ppgtt);
+		i915_ppgtt_put(ctx->ppgtt);
+	}
+
+	i915_ppgtt_open(ppgtt);
+	ctx->ppgtt = i915_ppgtt_get(ppgtt);
+
+	ctx->desc_template = default_desc_template(ctx->i915, ppgtt);
+}
+
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *dev_priv,
 			struct drm_i915_file_private *file_priv)
@@ -427,8 +446,8 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
 			return ERR_CAST(ppgtt);
 		}
 
-		ctx->ppgtt = ppgtt;
-		ctx->desc_template = default_desc_template(dev_priv, ppgtt);
+		__set_ppgtt(ctx, ppgtt);
+		i915_ppgtt_put(ppgtt);
 	}
 
 	trace_i915_context_create(ctx);
@@ -784,6 +803,87 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 	return 0;
 }
 
+static int get_ppgtt(struct i915_gem_context *ctx, u64 *out)
+{
+	struct drm_i915_file_private *file_priv = ctx->file_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int ret;
+
+	/* XXX rcu acquire? */
+	ppgtt = ctx->ppgtt;
+	if (!ppgtt)
+		return -ENODEV;
+
+	ret = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (ret)
+		return ret;
+
+	ret = ppgtt->user_handle;
+	if (!ret) {
+		ret = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
+		if (ret > 0) {
+			ppgtt->user_handle = ret;
+			i915_ppgtt_get(ppgtt);
+		}
+	}
+
+	mutex_unlock(&file_priv->vm_lock);
+	if (ret < 0)
+		return ret;
+
+	*out = ret;
+	return 0;
+}
+
+static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
+{
+	struct drm_i915_file_private *file_priv = ctx->file_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int err;
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		return err;
+
+	ppgtt = idr_find(&file_priv->vm_idr, id);
+	if (ppgtt)
+		i915_ppgtt_get(ppgtt);
+	mutex_unlock(&file_priv->vm_lock);
+	if (!ppgtt)
+		return -ENOENT;
+
+	err = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (err)
+		goto out;
+
+	/*
+	 * We need to flush any requests using the current ppgtt before
+	 * we release it as the requests do not hold a reference themselves,
+	 * only indirectly through the context. By switching to the kernel
+	 * context, we ensure that the TLBs are reloaded before using the
+	 * same context again -- an extra layer of paranoia over wait_for_idle.
+	 */
+	err = i915_gem_switch_to_kernel_context(ctx->i915);
+	if (err)
+		goto out_unlock;
+
+	err = i915_gem_wait_for_idle(ctx->i915,
+				     I915_WAIT_LOCKED |
+				     I915_WAIT_INTERRUPTIBLE,
+				     MAX_SCHEDULE_TIMEOUT);
+	if (err)
+		goto out_unlock;
+
+	__set_ppgtt(ctx, ppgtt);
+
+out_unlock:
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+out:
+	i915_ppgtt_put(ppgtt);
+	return err;
+}
+
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
 {
 	return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED;
@@ -896,6 +996,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_PRIORITY:
 		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
 		break;
+	case I915_CONTEXT_PARAM_VM:
+		ret = get_ppgtt(ctx, &args->value);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -968,6 +1071,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		}
 		break;
 
+	case I915_CONTEXT_PARAM_VM:
+		if (args->size)
+			ret = -EINVAL;
+		else if (upper_32_bits(args->value))
+			ret = -EINVAL;
+		else
+			ret = set_ppgtt(ctx, lower_32_bits(args->value));
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 49b00996a15e..27b40c14e745 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2104,10 +2104,21 @@ i915_ppgtt_create(struct drm_i915_private *i915,
 	return ppgtt;
 }
 
-void i915_ppgtt_close(struct i915_address_space *vm)
+void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt)
 {
-	GEM_BUG_ON(vm->closed);
-	vm->closed = true;
+	GEM_BUG_ON(ppgtt->vm.closed);
+
+	ppgtt->open_count++;
+}
+
+void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt)
+{
+	GEM_BUG_ON(!ppgtt->open_count);
+	if (--ppgtt->open_count)
+		return;
+
+	GEM_BUG_ON(ppgtt->vm.closed);
+	ppgtt->vm.closed = true;
 }
 
 static void ppgtt_destroy_vma(struct i915_address_space *vm)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 61698609a62c..bb750318f52a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -391,6 +391,8 @@ struct i915_hw_ppgtt {
 	struct kref ref;
 
 	unsigned long pd_dirty_rings;
+	unsigned int open_count;
+
 	union {
 		struct i915_pml4 pml4;		/* GEN8+ & 48b PPGTT */
 		struct i915_page_directory_pointer pdp;	/* GEN8+ */
@@ -608,12 +610,16 @@ int i915_ppgtt_init_hw(struct drm_i915_private *dev_priv);
 void i915_ppgtt_release(struct kref *kref);
 struct i915_hw_ppgtt *i915_ppgtt_create(struct drm_i915_private *dev_priv,
 					struct drm_i915_file_private *fpriv);
-void i915_ppgtt_close(struct i915_address_space *vm);
-static inline void i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
+
+void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt);
+void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt);
+
+static inline struct i915_hw_ppgtt *i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
 {
-	if (ppgtt)
-		kref_get(&ppgtt->ref);
+	kref_get(&ppgtt->ref);
+	return ppgtt;
 }
+
 static inline void i915_ppgtt_put(struct i915_hw_ppgtt *ppgtt)
 {
 	if (ppgtt)
diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c
index a9a2fa35876f..a7ee8e97bcee 100644
--- a/drivers/gpu/drm/i915/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/selftests/huge_pages.c
@@ -1734,7 +1734,6 @@ int i915_gem_huge_page_mock_selftests(void)
 	err = i915_subtests(tests, ppgtt);
 
 out_close:
-	i915_ppgtt_close(&ppgtt->vm);
 	i915_ppgtt_put(ppgtt);
 
 out_unlock:
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 6a241745e78a..2864cfb82325 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -243,6 +243,12 @@ static int live_nop_switch(void *arg)
 	return err;
 }
 
+#define GEN8_HIGH_ADDRESS_BIT 47
+static inline u64 gen8_canonical_addr(u64 address)
+{
+	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
+}
+
 static struct i915_vma *
 gpu_fill_dw(struct i915_vma *vma, u64 offset, unsigned long count, u32 value)
 {
@@ -266,6 +272,7 @@ gpu_fill_dw(struct i915_vma *vma, u64 offset, unsigned long count, u32 value)
 
 	GEM_BUG_ON(offset + (count - 1) * PAGE_SIZE > vma->node.size);
 	offset += vma->node.start;
+	offset = gen8_canonical_addr(offset);
 
 	for (n = 0; n < count; n++) {
 		if (gen >= 8) {
@@ -391,6 +398,7 @@ static int gpu_fill(struct drm_i915_gem_object *obj,
 		goto skip_request;
 
 	i915_gem_object_set_active_reference(batch->obj);
+
 	i915_vma_unpin(batch);
 	i915_vma_close(batch);
 
@@ -439,7 +447,8 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value)
 	return 0;
 }
 
-static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
+static noinline int cpu_check(struct drm_i915_gem_object *obj,
+			      unsigned int idx, unsigned int max)
 {
 	unsigned int n, m, needs_flush;
 	int err;
@@ -457,8 +466,10 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
 
 		for (m = 0; m < max; m++) {
 			if (map[m] != m) {
-				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
-				       n, m, map[m], m);
+				pr_err("%pS: Invalid value at object %d page %d/%ld, offset %d/%d: found %x expected %x\n",
+				       __builtin_return_address(0), idx,
+				       n, real_page_count(obj), m, max,
+				       map[m], m);
 				err = -EINVAL;
 				goto out_unmap;
 			}
@@ -466,8 +477,9 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
 
 		for (; m < DW_PER_PAGE; m++) {
 			if (map[m] != STACK_MAGIC) {
-				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
-				       n, m, map[m], STACK_MAGIC);
+				pr_err("%pS: Invalid value at object %d page %d, offset %d: found %x expected %x (uninitialised)\n",
+				       __builtin_return_address(0), idx, n, m,
+				       map[m], STACK_MAGIC);
 				err = -EINVAL;
 				goto out_unmap;
 			}
@@ -545,12 +557,8 @@ static unsigned long max_dwords(struct drm_i915_gem_object *obj)
 static int igt_ctx_exec(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
-	struct drm_i915_gem_object *obj = NULL;
-	unsigned long ncontexts, ndwords, dw;
-	struct drm_file *file;
-	IGT_TIMEOUT(end_time);
-	LIST_HEAD(objects);
-	struct live_test t;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	int err = -ENODEV;
 
 	/*
@@ -562,38 +570,42 @@ static int igt_ctx_exec(void *arg)
 	if (!DRIVER_CAPS(i915)->has_logical_contexts)
 		return 0;
 
-	file = mock_file(i915);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
+	for_each_engine(engine, i915, id) {
+		struct drm_i915_gem_object *obj = NULL;
+		unsigned long ncontexts, ndwords, dw;
+		struct drm_file *file;
+		IGT_TIMEOUT(end_time);
+		LIST_HEAD(objects);
+		struct live_test t;
 
-	mutex_lock(&i915->drm.struct_mutex);
+		if (!intel_engine_can_store_dword(engine))
+			continue;
 
-	err = begin_live_test(&t, i915, __func__, "");
-	if (err)
-		goto out_unlock;
+		if (!engine->context_size)
+			continue; /* No logical context support in HW */
 
-	ncontexts = 0;
-	ndwords = 0;
-	dw = 0;
-	while (!time_after(jiffies, end_time)) {
-		struct intel_engine_cs *engine;
-		struct i915_gem_context *ctx;
-		unsigned int id;
+		file = mock_file(i915);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
 
-		ctx = i915_gem_create_context(i915, file->driver_priv);
-		if (IS_ERR(ctx)) {
-			err = PTR_ERR(ctx);
+		mutex_lock(&i915->drm.struct_mutex);
+
+		err = begin_live_test(&t, i915, __func__, engine->name);
+		if (err)
 			goto out_unlock;
-		}
 
-		for_each_engine(engine, i915, id) {
+		ncontexts = 0;
+		ndwords = 0;
+		dw = 0;
+		while (!time_after(jiffies, end_time)) {
+			struct i915_gem_context *ctx;
 			intel_wakeref_t wakeref;
 
-			if (!engine->context_size)
-				continue; /* No logical context support in HW */
-
-			if (!intel_engine_can_store_dword(engine))
-				continue;
+			ctx = i915_gem_create_context(i915, file->driver_priv);
+			if (IS_ERR(ctx)) {
+				err = PTR_ERR(ctx);
+				goto out_unlock;
+			}
 
 			if (!obj) {
 				obj = create_test_object(ctx, file, &objects);
@@ -603,7 +615,6 @@ static int igt_ctx_exec(void *arg)
 				}
 			}
 
-			err = 0;
 			with_intel_runtime_pm(i915, wakeref)
 				err = gpu_fill(obj, ctx, engine, dw);
 			if (err) {
@@ -618,32 +629,158 @@ static int igt_ctx_exec(void *arg)
 				obj = NULL;
 				dw = 0;
 			}
+
 			ndwords++;
+			ncontexts++;
+		}
+
+		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
+			ncontexts, engine->name, ndwords);
+
+		ncontexts = dw = 0;
+		list_for_each_entry(obj, &objects, st_link) {
+			unsigned int rem =
+				min_t(unsigned int, ndwords - dw, max_dwords(obj));
+
+			err = cpu_check(obj, ncontexts++, rem);
+			if (err)
+				break;
+
+			dw += rem;
 		}
-		ncontexts++;
+
+out_unlock:
+		if (end_live_test(&t))
+			err = -EIO;
+		mutex_unlock(&i915->drm.struct_mutex);
+
+		mock_file_free(i915, file);
+		if (err)
+			return err;
 	}
-	pr_info("Submitted %lu contexts (across %u engines), filling %lu dwords\n",
-		ncontexts, RUNTIME_INFO(i915)->num_rings, ndwords);
 
-	dw = 0;
-	list_for_each_entry(obj, &objects, st_link) {
-		unsigned int rem =
-			min_t(unsigned int, ndwords - dw, max_dwords(obj));
+	return 0;
+}
 
-		err = cpu_check(obj, rem);
+static int igt_shared_ctx_exec(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err = -ENODEV;
+
+	/*
+	 * Create a few different contexts with the same mm and write
+	 * through each ctx using the GPU making sure those writes end
+	 * up in the expected pages of our obj.
+	 */
+
+	for_each_engine(engine, i915, id) {
+		unsigned long ncontexts, ndwords, dw;
+		struct drm_i915_gem_object *obj = NULL;
+		struct i915_gem_context *ctx = NULL;
+		struct i915_gem_context *parent;
+		struct drm_file *file;
+		IGT_TIMEOUT(end_time);
+		LIST_HEAD(objects);
+		struct live_test t;
+
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		file = mock_file(i915);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+
+		mutex_lock(&i915->drm.struct_mutex);
+
+		err = begin_live_test(&t, i915, __func__, engine->name);
 		if (err)
-			break;
+			goto out_unlock;
 
-		dw += rem;
-	}
+		parent = i915_gem_create_context(i915, file->driver_priv);
+		if (IS_ERR(parent)) {
+			err = PTR_ERR(parent);
+			if (err == -ENODEV) /* no logical ctx support */
+				err = 0;
+			goto out_unlock;
+		}
+
+		if (!parent->ppgtt) {
+			err = 0;
+			goto out_unlock;
+		}
+
+		ncontexts = 0;
+		ndwords = 0;
+		dw = 0;
+		while (!time_after(jiffies, end_time)) {
+			intel_wakeref_t wakeref;
+
+			if (ctx)
+				__destroy_hw_context(ctx, file->driver_priv);
+
+			ctx = i915_gem_create_context(i915, file->driver_priv);
+			if (IS_ERR(ctx)) {
+				err = PTR_ERR(ctx);
+				goto out_unlock;
+			}
+
+			__set_ppgtt(ctx, parent->ppgtt);
+
+			if (!obj) {
+				obj = create_test_object(parent, file, &objects);
+				if (IS_ERR(obj)) {
+					err = PTR_ERR(obj);
+					goto out_unlock;
+				}
+			}
+
+			err = 0;
+			with_intel_runtime_pm(i915, wakeref)
+				err = gpu_fill(obj, ctx, engine, dw);
+			if (err) {
+				pr_err("Failed to fill dword %lu [%lu/%lu] with gpu (%s) in ctx %u [full-ppgtt? %s], err=%d\n",
+				       ndwords, dw, max_dwords(obj),
+				       engine->name, ctx->hw_id,
+				       yesno(!!ctx->ppgtt), err);
+				goto out_unlock;
+			}
+
+			if (++dw == max_dwords(obj)) {
+				obj = NULL;
+				dw = 0;
+			}
+
+			ndwords++;
+			ncontexts++;
+		}
+		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
+			ncontexts, engine->name, ndwords);
+
+		ncontexts = dw = 0;
+		list_for_each_entry(obj, &objects, st_link) {
+			unsigned int rem =
+				min_t(unsigned int, ndwords - dw, max_dwords(obj));
+
+			err = cpu_check(obj, ncontexts++, rem);
+			if (err)
+				break;
+
+			dw += rem;
+		}
 
 out_unlock:
-	if (end_live_test(&t))
-		err = -EIO;
-	mutex_unlock(&i915->drm.struct_mutex);
+		if (end_live_test(&t))
+			err = -EIO;
+		mutex_unlock(&i915->drm.struct_mutex);
 
-	mock_file_free(i915, file);
-	return err;
+		mock_file_free(i915, file);
+		if (err)
+			return err;
+	}
+
+	return 0;
 }
 
 static int igt_ctx_readonly(void *arg)
@@ -652,7 +789,7 @@ static int igt_ctx_readonly(void *arg)
 	struct drm_i915_gem_object *obj = NULL;
 	struct i915_gem_context *ctx;
 	struct i915_hw_ppgtt *ppgtt;
-	unsigned long ndwords, dw;
+	unsigned long idx, ndwords, dw;
 	struct drm_file *file;
 	I915_RND_STATE(prng);
 	IGT_TIMEOUT(end_time);
@@ -733,6 +870,7 @@ static int igt_ctx_readonly(void *arg)
 		ndwords, RUNTIME_INFO(i915)->num_rings);
 
 	dw = 0;
+	idx = 0;
 	list_for_each_entry(obj, &objects, st_link) {
 		unsigned int rem =
 			min_t(unsigned int, ndwords - dw, max_dwords(obj));
@@ -742,7 +880,7 @@ static int igt_ctx_readonly(void *arg)
 		if (i915_gem_object_is_readonly(obj))
 			num_writes = 0;
 
-		err = cpu_check(obj, num_writes);
+		err = cpu_check(obj, idx++, num_writes);
 		if (err)
 			break;
 
@@ -1214,6 +1352,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
 		SUBTEST(live_nop_switch),
 		SUBTEST(igt_ctx_exec),
 		SUBTEST(igt_ctx_readonly),
+		SUBTEST(igt_shared_ctx_exec),
 		SUBTEST(igt_vm_isolation),
 	};
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 35eb40e5de91..298c4e11deda 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1020,7 +1020,6 @@ static int exercise_ppgtt(struct drm_i915_private *dev_priv,
 
 	err = func(dev_priv, &ppgtt->vm, 0, ppgtt->vm.total, end_time);
 
-	i915_ppgtt_close(&ppgtt->vm);
 	i915_ppgtt_put(ppgtt);
 out_unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index 2009e776b5d8..b9fd2a4b95e9 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -59,13 +59,17 @@ mock_context(struct drm_i915_private *i915,
 		goto err_handles;
 
 	if (name) {
+		struct i915_hw_ppgtt *ppgtt;
+
 		ctx->name = kstrdup(name, GFP_KERNEL);
 		if (!ctx->name)
 			goto err_put;
 
-		ctx->ppgtt = mock_ppgtt(i915, name);
-		if (!ctx->ppgtt)
+		ppgtt = mock_ppgtt(i915, name);
+		if (!ppgtt)
 			goto err_put;
+
+		__set_ppgtt(ctx, ppgtt);
 	}
 
 	return ctx;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index c3336c931a95..eb19cf8a77ef 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1516,6 +1516,7 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+#define I915_CONTEXT_PARAM_VM		0x7
 	__u64 value;
 };
 
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 30/38] drm/i915: Extend CONTEXT_CREATE to set parameters upon construction
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (28 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines Chris Wilson
                   ` (9 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

It can be useful to have a single ioctl to create a context with all
the initial parameters instead of a series of create + setparam + setparam
ioctls. This extension to create context allows any of the parameters
to be passed in as a linked list to be applied to the newly constructed
context.
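
As an example of the intended use, creating a context and setting its
priority in a single call might look like the sketch below (error handling
trimmed; the structs, flags and ioctl number are as added by this patch):

#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

/* Sketch: create a context with an initial priority via the extension chain. */
static int create_ctx_with_priority(int fd, int prio, uint32_t *ctx_id)
{
        struct drm_i915_gem_context_create_ext_setparam p = {
                .base = { .name = I915_CONTEXT_CREATE_EXT_SETPARAM },
                .setparam = {
                        /* ctx_id is left 0: the param applies to the new context */
                        .param = I915_CONTEXT_PARAM_PRIORITY,
                        .value = prio,
                },
        };
        struct drm_i915_gem_context_create_ext create = {
                .flags = 0,
                .extensions = (uintptr_t)&p, /* head of the extension chain */
        };
        int err;

        err = ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create);
        if (err)
                return err;

        *ctx_id = create.ctx_id;
        return 0;
}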

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |   2 +-
 drivers/gpu/drm/i915/i915_gem_context.c | 172 +++++++++++++++---------
 include/uapi/drm/i915_drm.h             |  50 ++++---
 3 files changed, 137 insertions(+), 87 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 56bd087f1c4e..bab0d45dd7ab 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -2992,7 +2992,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_SET_SPRITE_COLORKEY, intel_sprite_set_colorkey_ioctl, DRM_MASTER),
 	DRM_IOCTL_DEF_DRV(I915_GET_SPRITE_COLORKEY, drm_noop, DRM_MASTER),
 	DRM_IOCTL_DEF_DRV(I915_GEM_WAIT, i915_gem_wait_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
-	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE, i915_gem_context_create_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_CREATE_EXT, i915_gem_context_create_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_CONTEXT_DESTROY, i915_gem_context_destroy_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_REG_READ, i915_reg_read_ioctl, DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GET_RESET_STATS, i915_gem_context_reset_stats_ioctl, DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index f707241dbc78..ec5e3e1c6402 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -86,9 +86,12 @@
  */
 
 #include <linux/log2.h>
+
 #include <drm/i915_drm.h>
+
 #include "i915_drv.h"
 #include "i915_trace.h"
+#include "i915_user_extensions.h"
 #include "intel_workarounds.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
@@ -884,6 +887,94 @@ static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
 	return err;
 }
 
+static int ctx_setparam(struct i915_gem_context *ctx,
+			const struct drm_i915_gem_context_param *args)
+{
+	int ret = 0;
+
+	switch (args->param) {
+	case I915_CONTEXT_PARAM_NO_ZEROMAP:
+		if (args->size)
+			ret = -EINVAL;
+		else if (args->value)
+			set_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
+		else
+			clear_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
+		break;
+
+	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (args->value)
+			i915_gem_context_set_no_error_capture(ctx);
+		else
+			i915_gem_context_clear_no_error_capture(ctx);
+		break;
+
+	case I915_CONTEXT_PARAM_BANNABLE:
+		if (args->size)
+			ret = -EINVAL;
+		else if (!capable(CAP_SYS_ADMIN) && !args->value)
+			ret = -EPERM;
+		else if (args->value)
+			i915_gem_context_set_bannable(ctx);
+		else
+			i915_gem_context_clear_bannable(ctx);
+		break;
+
+	case I915_CONTEXT_PARAM_PRIORITY:
+		{
+			s64 priority = args->value;
+
+			if (args->size)
+				ret = -EINVAL;
+			else if (!(ctx->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
+				ret = -ENODEV;
+			else if (priority > I915_CONTEXT_MAX_USER_PRIORITY ||
+				 priority < I915_CONTEXT_MIN_USER_PRIORITY)
+				ret = -EINVAL;
+			else if (priority > I915_CONTEXT_DEFAULT_PRIORITY &&
+				 !capable(CAP_SYS_NICE))
+				ret = -EPERM;
+			else
+				ctx->sched.priority =
+					I915_USER_PRIORITY(priority);
+		}
+		break;
+
+	case I915_CONTEXT_PARAM_VM:
+		if (args->size)
+			ret = -EINVAL;
+		else if (upper_32_bits(args->value))
+			ret = -EINVAL;
+		else
+			ret = set_ppgtt(ctx, lower_32_bits(args->value));
+		break;
+
+	case I915_CONTEXT_PARAM_BAN_PERIOD:
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	return ret;
+}
+
+static int create_setparam(struct i915_user_extension *ext, void *data)
+{
+	const struct drm_i915_gem_context_create_ext_setparam *args =
+		container_of(ext, typeof(*args), base);
+
+	if (args->setparam.ctx_id)
+		return -EINVAL;
+
+	return ctx_setparam(data, &args->setparam);
+}
+
+static const i915_user_extension_fn create_extensions[] = {
+	[I915_CONTEXT_CREATE_EXT_SETPARAM] = create_setparam,
+};
+
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
 {
 	return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED;
@@ -893,7 +984,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file)
 {
 	struct drm_i915_private *dev_priv = to_i915(dev);
-	struct drm_i915_gem_context_create *args = data;
+	struct drm_i915_gem_context_create_ext *args = data;
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_gem_context *ctx;
 	int ret;
@@ -901,7 +992,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (!DRIVER_CAPS(dev_priv)->has_logical_contexts)
 		return -ENODEV;
 
-	if (args->pad != 0)
+	if (args->flags)
 		return -EINVAL;
 
 	if (client_is_banned(file_priv)) {
@@ -923,6 +1014,16 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 
 	GEM_BUG_ON(i915_gem_context_is_kernel(ctx));
 
+	ret = i915_user_extensions(u64_to_user_ptr(args->extensions),
+				   create_extensions,
+				   ARRAY_SIZE(create_extensions),
+				   ctx);
+	if (ret) {
+		idr_remove(&file_priv->context_idr, ctx->user_handle);
+		context_close(ctx);
+		return ret;
+	}
+
 	args->ctx_id = ctx->user_handle;
 	DRM_DEBUG("HW context %d created\n", args->ctx_id);
 
@@ -1014,76 +1115,13 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct drm_i915_gem_context_param *args = data;
 	struct i915_gem_context *ctx;
-	int ret = 0;
+	int ret;
 
 	ctx = i915_gem_context_lookup(file_priv, args->ctx_id);
 	if (!ctx)
 		return -ENOENT;
 
-	switch (args->param) {
-	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		ret = -EINVAL;
-		break;
-	case I915_CONTEXT_PARAM_NO_ZEROMAP:
-		if (args->size)
-			ret = -EINVAL;
-		else if (args->value)
-			set_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
-		else
-			clear_bit(UCONTEXT_NO_ZEROMAP, &ctx->user_flags);
-		break;
-	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
-		if (args->size)
-			ret = -EINVAL;
-		else if (args->value)
-			i915_gem_context_set_no_error_capture(ctx);
-		else
-			i915_gem_context_clear_no_error_capture(ctx);
-		break;
-	case I915_CONTEXT_PARAM_BANNABLE:
-		if (args->size)
-			ret = -EINVAL;
-		else if (!capable(CAP_SYS_ADMIN) && !args->value)
-			ret = -EPERM;
-		else if (args->value)
-			i915_gem_context_set_bannable(ctx);
-		else
-			i915_gem_context_clear_bannable(ctx);
-		break;
-
-	case I915_CONTEXT_PARAM_PRIORITY:
-		{
-			s64 priority = args->value;
-
-			if (args->size)
-				ret = -EINVAL;
-			else if (!(to_i915(dev)->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
-				ret = -ENODEV;
-			else if (priority > I915_CONTEXT_MAX_USER_PRIORITY ||
-				 priority < I915_CONTEXT_MIN_USER_PRIORITY)
-				ret = -EINVAL;
-			else if (priority > I915_CONTEXT_DEFAULT_PRIORITY &&
-				 !capable(CAP_SYS_NICE))
-				ret = -EPERM;
-			else
-				ctx->sched.priority =
-					I915_USER_PRIORITY(priority);
-		}
-		break;
-
-	case I915_CONTEXT_PARAM_VM:
-		if (args->size)
-			ret = -EINVAL;
-		else if (upper_32_bits(args->value))
-			ret = -EINVAL;
-		else
-			ret = set_ppgtt(ctx, lower_32_bits(args->value));
-		break;
-
-	default:
-		ret = -EINVAL;
-		break;
-	}
+	ret = ctx_setparam(ctx, args);
 
 	i915_gem_context_put(ctx);
 	return ret;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index eb19cf8a77ef..704e9d2fe2d6 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -389,6 +389,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GET_SPRITE_COLORKEY DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GET_SPRITE_COLORKEY, struct drm_intel_sprite_colorkey)
 #define DRM_IOCTL_I915_GEM_WAIT		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_WAIT, struct drm_i915_gem_wait)
 #define DRM_IOCTL_I915_GEM_CONTEXT_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create)
+#define DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_CREATE, struct drm_i915_gem_context_create_ext)
 #define DRM_IOCTL_I915_GEM_CONTEXT_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_CONTEXT_DESTROY, struct drm_i915_gem_context_destroy)
 #define DRM_IOCTL_I915_REG_READ			DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_REG_READ, struct drm_i915_reg_read)
 #define DRM_IOCTL_I915_GET_RESET_STATS		DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GET_RESET_STATS, struct drm_i915_reset_stats)
@@ -1436,11 +1437,39 @@ struct drm_i915_gem_wait {
 };
 
 struct drm_i915_gem_context_create {
-	/*  output: id of new context*/
-	__u32 ctx_id;
+	__u32 ctx_id; /* output: id of new context*/
 	__u32 pad;
 };
 
+struct drm_i915_gem_context_create_ext {
+	__u32 ctx_id; /* output: id of new context*/
+	__u32 flags;
+	__u64 extensions;
+};
+
+struct drm_i915_gem_context_param {
+	__u32 ctx_id;
+	__u32 size;
+	__u64 param;
+#define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
+#define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
+#define I915_CONTEXT_PARAM_GTT_SIZE	0x3
+#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_BANNABLE	0x5
+#define I915_CONTEXT_PARAM_PRIORITY	0x6
+#define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
+#define   I915_CONTEXT_DEFAULT_PRIORITY		0
+#define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
+#define I915_CONTEXT_PARAM_VM		0x7
+	__u64 value;
+};
+
+struct drm_i915_gem_context_create_ext_setparam {
+#define I915_CONTEXT_CREATE_EXT_SETPARAM 0
+	struct i915_user_extension base;
+	struct drm_i915_gem_context_param setparam;
+};
+
 struct drm_i915_gem_context_destroy {
 	__u32 ctx_id;
 	__u32 pad;
@@ -1503,23 +1532,6 @@ struct drm_i915_gem_userptr {
 	__u32 handle;
 };
 
-struct drm_i915_gem_context_param {
-	__u32 ctx_id;
-	__u32 size;
-	__u64 param;
-#define I915_CONTEXT_PARAM_BAN_PERIOD	0x1
-#define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
-#define I915_CONTEXT_PARAM_GTT_SIZE	0x3
-#define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
-#define I915_CONTEXT_PARAM_BANNABLE	0x5
-#define I915_CONTEXT_PARAM_PRIORITY	0x6
-#define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
-#define   I915_CONTEXT_DEFAULT_PRIORITY		0
-#define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
-#define I915_CONTEXT_PARAM_VM		0x7
-	__u64 value;
-};
-
 enum drm_i915_oa_format {
 	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
 	I915_OA_FORMAT_A29,	    /* HSW only */
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (29 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 30/38] drm/i915: Extend CONTEXT_CREATE to set parameters upon construction Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-24 17:35   ` Tvrtko Ursulin
  2019-01-18 14:01 ` [PATCH 32/38] drm/i915: Fix I915_EXEC_RING_MASK Chris Wilson
                   ` (8 subsequent siblings)
  39 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

Previously, our view has been always to run the engines independently
within a context. (Multiple engines happened before we had contexts and
timelines, so they always operated independently and that behaviour
persisted into contexts.) However, at the user level the context often
represents a single timeline (e.g. GL contexts) and userspace must
ensure that the individual engines are serialised to present that
ordering to the client (or forget about this detail entirely and hope no
one notices - a fair ploy if the client can only directly control one
engine themselves ;).

In the next patch, we will want to construct a set of engines that
operate as one, that have a single timeline interwoven between them, to
present a single virtual engine to the user. (They submit to the virtual
engine, then we decide which engine to execute on based on the current load.)

To that end, we want to be able to create contexts which have a single
timeline (fence context) shared between all engines, rather than multiple
timelines.
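
As a rough userspace sketch (not part of this patch; the open DRM fd,
libdrm's drmIoctl() helper and error handling are all assumed), creating
such a context once the CONTEXT_CREATE_EXT uAPI from earlier in this
series is available could look like:

#include <errno.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Hypothetical helper: ask for one timeline shared by every engine. */
static int create_single_timeline_context(int fd)
{
	struct drm_i915_gem_context_create_ext arg;

	memset(&arg, 0, sizeof(arg));
	arg.flags = I915_GEM_CONTEXT_SINGLE_TIMELINE;

	if (drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &arg))
		return -errno;

	return arg.ctx_id; /* requests on any engine of this ctx now serialise */
}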

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_context.c       | 33 +++++++++++++---
 drivers/gpu/drm/i915/i915_gem_context.h       |  3 ++
 drivers/gpu/drm/i915/i915_request.c           | 10 ++++-
 drivers/gpu/drm/i915/i915_request.h           |  5 ++-
 drivers/gpu/drm/i915/i915_sw_fence.c          | 39 ++++++++++++++++---
 drivers/gpu/drm/i915/i915_sw_fence.h          | 13 ++++++-
 drivers/gpu/drm/i915/intel_lrc.c              |  5 ++-
 .../gpu/drm/i915/selftests/i915_gem_context.c | 17 ++++----
 drivers/gpu/drm/i915/selftests/mock_context.c |  2 +-
 include/uapi/drm/i915_drm.h                   |  1 +
 10 files changed, 103 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index ec5e3e1c6402..e28be242399d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -225,6 +225,9 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 			ce->ops->destroy(ce);
 	}
 
+	if (ctx->timeline)
+		i915_timeline_put(ctx->timeline);
+
 	kfree(ctx->name);
 	put_pid(ctx->pid);
 
@@ -425,12 +428,17 @@ static void __set_ppgtt(struct i915_gem_context *ctx,
 
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *dev_priv,
-			struct drm_i915_file_private *file_priv)
+			struct drm_i915_file_private *file_priv,
+			unsigned int flags)
 {
 	struct i915_gem_context *ctx;
 
 	lockdep_assert_held(&dev_priv->drm.struct_mutex);
 
+	if (flags & I915_GEM_CONTEXT_SINGLE_TIMELINE &&
+	    !HAS_EXECLISTS(dev_priv))
+		return ERR_PTR(-EINVAL);
+
 	/* Reap the most stale context */
 	contexts_free_first(dev_priv);
 
@@ -453,6 +461,18 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
 		i915_ppgtt_put(ppgtt);
 	}
 
+	if (flags & I915_GEM_CONTEXT_SINGLE_TIMELINE) {
+		struct i915_timeline *timeline;
+
+		timeline = i915_timeline_create(dev_priv, ctx->name, NULL);
+		if (IS_ERR(timeline)) {
+			__destroy_hw_context(ctx, file_priv);
+			return ERR_CAST(timeline);
+		}
+
+		ctx->timeline = timeline;
+	}
+
 	trace_i915_context_create(ctx);
 
 	return ctx;
@@ -481,7 +501,7 @@ i915_gem_context_create_gvt(struct drm_device *dev)
 	if (ret)
 		return ERR_PTR(ret);
 
-	ctx = i915_gem_create_context(to_i915(dev), NULL);
+	ctx = i915_gem_create_context(to_i915(dev), NULL, 0);
 	if (IS_ERR(ctx))
 		goto out;
 
@@ -517,7 +537,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
 	struct i915_gem_context *ctx;
 	int err;
 
-	ctx = i915_gem_create_context(i915, NULL);
+	ctx = i915_gem_create_context(i915, NULL, 0);
 	if (IS_ERR(ctx))
 		return ctx;
 
@@ -638,7 +658,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
 	idr_init_base(&file_priv->vm_idr, 1);
 
 	mutex_lock(&i915->drm.struct_mutex);
-	ctx = i915_gem_create_context(i915, file_priv);
+	ctx = i915_gem_create_context(i915, file_priv, 0);
 	mutex_unlock(&i915->drm.struct_mutex);
 	if (IS_ERR(ctx)) {
 		idr_destroy(&file_priv->context_idr);
@@ -992,7 +1012,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (!DRIVER_CAPS(dev_priv)->has_logical_contexts)
 		return -ENODEV;
 
-	if (args->flags)
+	if (args->flags &
+	    ~(I915_GEM_CONTEXT_SINGLE_TIMELINE))
 		return -EINVAL;
 
 	if (client_is_banned(file_priv)) {
@@ -1007,7 +1028,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		return ret;
 
-	ctx = i915_gem_create_context(dev_priv, file_priv);
+	ctx = i915_gem_create_context(dev_priv, file_priv, args->flags);
 	mutex_unlock(&dev->struct_mutex);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 9786f86b659d..b3a840747330 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -41,6 +41,7 @@ struct drm_i915_private;
 struct drm_i915_file_private;
 struct i915_hw_ppgtt;
 struct i915_request;
+struct i915_timeline;
 struct i915_vma;
 struct intel_ring;
 
@@ -66,6 +67,8 @@ struct i915_gem_context {
 	/** file_priv: owning file descriptor */
 	struct drm_i915_file_private *file_priv;
 
+	struct i915_timeline *timeline;
+
 	/**
 	 * @ppgtt: unique address space (GTT)
 	 *
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 7bccf578cd65..ca432d3d8211 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -860,8 +860,14 @@ void i915_request_add(struct i915_request *request)
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
 	if (prev && !i915_request_completed(prev)) {
-		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
-					     &request->submitq);
+		if (prev->engine == engine)
+			i915_sw_fence_await_sw_fence(&request->submit,
+						     &prev->submit,
+						     &request->submitq);
+		else
+			__i915_sw_fence_await_dma_fence(&request->submit,
+							&prev->fence,
+							&request->dmaq);
 		if (engine->schedule)
 			__i915_sched_node_add_dependency(&request->sched,
 							 &prev->sched,
diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
index 679b4663f774..f715384ff485 100644
--- a/drivers/gpu/drm/i915/i915_request.h
+++ b/drivers/gpu/drm/i915/i915_request.h
@@ -108,7 +108,10 @@ struct i915_request {
 	 * It is used by the driver to then queue the request for execution.
 	 */
 	struct i915_sw_fence submit;
-	wait_queue_entry_t submitq;
+	union {
+		wait_queue_entry_t submitq;
+		struct i915_sw_dma_fence_cb dmaq;
+	};
 
 	/*
 	 * A list of everyone we wait upon, and everyone who waits upon us.
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 7c58b049ecb5..7bb64437dbbe 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -359,11 +359,6 @@ int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
 }
 
-struct i915_sw_dma_fence_cb {
-	struct dma_fence_cb base;
-	struct i915_sw_fence *fence;
-};
-
 struct i915_sw_dma_fence_cb_timer {
 	struct i915_sw_dma_fence_cb base;
 	struct dma_fence *dma;
@@ -480,6 +475,40 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 	return ret;
 }
 
+static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
+				     struct dma_fence_cb *data)
+{
+	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
+
+	i915_sw_fence_complete(cb->fence);
+}
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb)
+{
+	int ret;
+
+	debug_fence_assert(fence);
+
+	if (dma_fence_is_signaled(dma))
+		return 0;
+
+	cb->fence = fence;
+	i915_sw_fence_await(fence);
+
+	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
+	if (ret == 0) {
+		ret = 1;
+	} else {
+		i915_sw_fence_complete(fence);
+		if (ret == -ENOENT) /* fence already signaled */
+			ret = 0;
+	}
+
+	return ret;
+}
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 0e055ea0179f..b420ceadb813 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -9,14 +9,13 @@
 #ifndef _I915_SW_FENCE_H_
 #define _I915_SW_FENCE_H_
 
+#include <linux/dma-fence.h>
 #include <linux/gfp.h>
 #include <linux/kref.h>
 #include <linux/notifier.h> /* for NOTIFY_DONE */
 #include <linux/wait.h>
 
 struct completion;
-struct dma_fence;
-struct dma_fence_ops;
 struct reservation_object;
 
 struct i915_sw_fence {
@@ -68,10 +67,20 @@ int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
 int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
 				     struct i915_sw_fence *after,
 				     gfp_t gfp);
+
+struct i915_sw_dma_fence_cb {
+	struct dma_fence_cb base;
+	struct i915_sw_fence *fence;
+};
+
+int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
+				    struct dma_fence *dma,
+				    struct i915_sw_dma_fence_cb *cb);
 int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
 				  struct dma_fence *dma,
 				  unsigned long timeout,
 				  gfp_t gfp);
+
 int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
 				    struct reservation_object *resv,
 				    const struct dma_fence_ops *exclude,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 323341e9bf2d..10c42820bb46 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -2644,7 +2644,10 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 		goto error_deref_obj;
 	}
 
-	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
+	if (ctx->timeline)
+		timeline = i915_timeline_get(ctx->timeline);
+	else
+		timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
 	if (IS_ERR(timeline)) {
 		ret = PTR_ERR(timeline);
 		goto error_deref_obj;
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 2864cfb82325..3e68c2888b9c 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -143,7 +143,7 @@ static int live_nop_switch(void *arg)
 	}
 
 	for (n = 0; n < nctx; n++) {
-		ctx[n] = i915_gem_create_context(i915, file->driver_priv);
+		ctx[n] = i915_gem_create_context(i915, file->driver_priv, 0);
 		if (IS_ERR(ctx[n])) {
 			err = PTR_ERR(ctx[n]);
 			goto out_unlock;
@@ -601,7 +601,8 @@ static int igt_ctx_exec(void *arg)
 			struct i915_gem_context *ctx;
 			intel_wakeref_t wakeref;
 
-			ctx = i915_gem_create_context(i915, file->driver_priv);
+			ctx = i915_gem_create_context(i915,
+						      file->driver_priv, 0);
 			if (IS_ERR(ctx)) {
 				err = PTR_ERR(ctx);
 				goto out_unlock;
@@ -698,7 +699,8 @@ static int igt_shared_ctx_exec(void *arg)
 		if (err)
 			goto out_unlock;
 
-		parent = i915_gem_create_context(i915, file->driver_priv);
+		parent = i915_gem_create_context(i915,
+						 file->driver_priv, 0);
 		if (IS_ERR(parent)) {
 			err = PTR_ERR(parent);
 			if (err == -ENODEV) /* no logical ctx support */
@@ -720,7 +722,8 @@ static int igt_shared_ctx_exec(void *arg)
 			if (ctx)
 				__destroy_hw_context(ctx, file->driver_priv);
 
-			ctx = i915_gem_create_context(i915, file->driver_priv);
+			ctx = i915_gem_create_context(i915,
+						      file->driver_priv, 0);
 			if (IS_ERR(ctx)) {
 				err = PTR_ERR(ctx);
 				goto out_unlock;
@@ -813,7 +816,7 @@ static int igt_ctx_readonly(void *arg)
 	if (err)
 		goto out_unlock;
 
-	ctx = i915_gem_create_context(i915, file->driver_priv);
+	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
 		goto out_unlock;
@@ -1139,13 +1142,13 @@ static int igt_vm_isolation(void *arg)
 	if (err)
 		goto out_unlock;
 
-	ctx_a = i915_gem_create_context(i915, file->driver_priv);
+	ctx_a = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx_a)) {
 		err = PTR_ERR(ctx_a);
 		goto out_unlock;
 	}
 
-	ctx_b = i915_gem_create_context(i915, file->driver_priv);
+	ctx_b = i915_gem_create_context(i915, file->driver_priv, 0);
 	if (IS_ERR(ctx_b)) {
 		err = PTR_ERR(ctx_b);
 		goto out_unlock;
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index b9fd2a4b95e9..f13f9c726034 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -99,7 +99,7 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
 {
 	lockdep_assert_held(&i915->drm.struct_mutex);
 
-	return i915_gem_create_context(i915, file->driver_priv);
+	return i915_gem_create_context(i915, file->driver_priv, 0);
 }
 
 struct i915_gem_context *
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 704e9d2fe2d6..72749dc9801e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1444,6 +1444,7 @@ struct drm_i915_gem_context_create {
 struct drm_i915_gem_context_create_ext {
 	__u32 ctx_id; /* output: id of new context*/
 	__u32 flags;
+#define I915_GEM_CONTEXT_SINGLE_TIMELINE	0x1
 	__u64 extensions;
 };
 
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 32/38] drm/i915: Fix I915_EXEC_RING_MASK
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (30 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 33/38] drm/i915: Remove last traces of exec-id (GEM_BUSY) Chris Wilson
                   ` (7 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx; +Cc: tvrtko.ursulin, Chris Wilson, stable

This was supposed to be a mask of all known rings, but it is being used
by execbuffer to filter out invalid rings, and so is instead mapping high
unused values onto valid rings. Instead of a mask of all known rings,
we need it to be the mask of all possible rings.
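
A toy illustration (not from the patch) of the aliasing: a selector that
no kernel has ever exposed slips through the old 3-bit mask and silently
runs on the render ring, whereas the widened mask lets execbuf reject it.

#include <stdio.h>

int main(void)
{
	unsigned int flags = 9; /* an engine index nothing ever exposed */

	/* old mask (7<<0): 9 & 7 == 1, silently becomes I915_EXEC_RENDER */
	printf("old ring id: %u\n", flags & 0x7);

	/* new mask (0x3f): 9 & 0x3f == 9, exceeds I915_USER_RINGS -> -EINVAL */
	printf("new ring id: %u\n", flags & 0x3f);

	return 0;
}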

Fixes: 549f7365820a ("drm/i915: Enable SandyBridge blitter ring")
Fixes: de1add360522 ("drm/i915: Decouple execbuf uAPI from internal implementation")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: <stable@vger.kernel.org> # v4.6+
---
 include/uapi/drm/i915_drm.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 72749dc9801e..ca25ca28d75f 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -997,7 +997,7 @@ struct drm_i915_gem_execbuffer2 {
 	 * struct drm_i915_gem_exec_fence *fences.
 	 */
 	__u64 cliprects_ptr;
-#define I915_EXEC_RING_MASK              (7<<0)
+#define I915_EXEC_RING_MASK              (0x3f)
 #define I915_EXEC_DEFAULT                (0<<0)
 #define I915_EXEC_RENDER                 (1<<0)
 #define I915_EXEC_BSD                    (2<<0)
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 33/38] drm/i915: Remove last traces of exec-id (GEM_BUSY)
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (31 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 32/38] drm/i915: Fix I915_EXEC_RING_MASK Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 34/38] drm/i915: Re-arrange execbuf so context is known before engine Chris Wilson
                   ` (6 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

As we now allow a per-context engine map, the legacy concept of
I915_EXEC_RING no longer applies universally. We are still exposing the
unrelated exec-id in GEM_BUSY, so transition this ioctl (once more
slightly changing its ABI, but no one cares) over to only reporting the
uabi-class (not instance as we can not foreseeably fit those into the
small bitmask).

The only user of the extended ring information from GEM_BUSY is ddx/sna,
which tries to use the non-rcs busyness information to guide which
engine to use for subsequent operations on foreign bo. All that matters
for it is the decision between rcs and !rcs, so it is unaffected by the
change in higher bits.
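
Purely as an illustration of the reporting scheme described above (this
helper is not part of the patch), userspace could decode the new result
like so; the write field carries class + 1 and the high word is a bitmask
indexed by engine class:

#include <stdint.h>

struct busy_report {
	int write_class;          /* -1 when there is no active writer */
	uint16_t read_class_mask; /* one bit per drm_i915_gem_engine_class */
};

static struct busy_report decode_gem_busy(uint32_t busy)
{
	struct busy_report r;

	r.write_class = (int)(busy & 0xffff) - 1; /* stored as class + 1 */
	r.read_class_mask = busy >> 16;           /* 1 << class per reader */

	return r;
}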

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c         | 32 +++++++++++++------------
 drivers/gpu/drm/i915/intel_engine_cs.c  | 10 --------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  1 -
 include/uapi/drm/i915_drm.h             | 32 +++++++++++++------------
 4 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 9a78e1c9b323..4c8691533e9b 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3939,20 +3939,17 @@ i915_gem_object_ggtt_pin(struct drm_i915_gem_object *obj,
 
 static __always_inline unsigned int __busy_read_flag(unsigned int id)
 {
-	/* Note that we could alias engines in the execbuf API, but
-	 * that would be very unwise as it prevents userspace from
-	 * fine control over engine selection. Ahem.
-	 *
-	 * This should be something like EXEC_MAX_ENGINE instead of
-	 * I915_NUM_ENGINES.
-	 */
-	BUILD_BUG_ON(I915_NUM_ENGINES > 16);
+	if (id == I915_ENGINE_CLASS_INVALID)
+		return 0xffff0000;
+
+	GEM_BUG_ON(id >= 16);
 	return 0x10000 << id;
 }
 
 static __always_inline unsigned int __busy_write_id(unsigned int id)
 {
-	/* The uABI guarantees an active writer is also amongst the read
+	/*
+	 * The uABI guarantees an active writer is also amongst the read
 	 * engines. This would be true if we accessed the activity tracking
 	 * under the lock, but as we perform the lookup of the object and
 	 * its activity locklessly we can not guarantee that the last_write
@@ -3960,16 +3957,20 @@ static __always_inline unsigned int __busy_write_id(unsigned int id)
 	 * last_read - hence we always set both read and write busy for
 	 * last_write.
 	 */
-	return id | __busy_read_flag(id);
+	if (id == I915_ENGINE_CLASS_INVALID)
+		return 0xffffffff;
+
+	return (id + 1) | __busy_read_flag(id);
 }
 
 static __always_inline unsigned int
 __busy_set_if_active(const struct dma_fence *fence,
 		     unsigned int (*flag)(unsigned int id))
 {
-	struct i915_request *rq;
+	const struct i915_request *rq;
 
-	/* We have to check the current hw status of the fence as the uABI
+	/*
+	 * We have to check the current hw status of the fence as the uABI
 	 * guarantees forward progress. We could rely on the idle worker
 	 * to eventually flush us, but to minimise latency just ask the
 	 * hardware.
@@ -3980,11 +3981,11 @@ __busy_set_if_active(const struct dma_fence *fence,
 		return 0;
 
 	/* opencode to_request() in order to avoid const warnings */
-	rq = container_of(fence, struct i915_request, fence);
+	rq = container_of(fence, const struct i915_request, fence);
 	if (i915_request_completed(rq))
 		return 0;
 
-	return flag(rq->engine->uabi_id);
+	return flag(rq->engine->uabi_class);
 }
 
 static __always_inline unsigned int
@@ -4018,7 +4019,8 @@ i915_gem_busy_ioctl(struct drm_device *dev, void *data,
 	if (!obj)
 		goto out;
 
-	/* A discrepancy here is that we do not report the status of
+	/*
+	 * A discrepancy here is that we do not report the status of
 	 * non-i915 fences, i.e. even though we may report the object as idle,
 	 * a call to set-domain may still stall waiting for foreign rendering.
 	 * This also means that wait-ioctl may report an object as busy,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 47f3cad6e861..53f68c983334 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -84,7 +84,6 @@ static const struct engine_class_info intel_engine_classes[] = {
 #define MAX_MMIO_BASES 3
 struct engine_info {
 	unsigned int hw_id;
-	unsigned int uabi_id;
 	u8 class;
 	u8 instance;
 	/* mmio bases table *must* be sorted in reverse gen order */
@@ -97,7 +96,6 @@ struct engine_info {
 static const struct engine_info intel_engines[] = {
 	[RCS] = {
 		.hw_id = RCS_HW,
-		.uabi_id = I915_EXEC_RENDER,
 		.class = RENDER_CLASS,
 		.instance = 0,
 		.mmio_bases = {
@@ -106,7 +104,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[BCS] = {
 		.hw_id = BCS_HW,
-		.uabi_id = I915_EXEC_BLT,
 		.class = COPY_ENGINE_CLASS,
 		.instance = 0,
 		.mmio_bases = {
@@ -115,7 +112,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VCS] = {
 		.hw_id = VCS_HW,
-		.uabi_id = I915_EXEC_BSD,
 		.class = VIDEO_DECODE_CLASS,
 		.instance = 0,
 		.mmio_bases = {
@@ -126,7 +122,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VCS2] = {
 		.hw_id = VCS2_HW,
-		.uabi_id = I915_EXEC_BSD,
 		.class = VIDEO_DECODE_CLASS,
 		.instance = 1,
 		.mmio_bases = {
@@ -136,7 +131,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VCS3] = {
 		.hw_id = VCS3_HW,
-		.uabi_id = I915_EXEC_BSD,
 		.class = VIDEO_DECODE_CLASS,
 		.instance = 2,
 		.mmio_bases = {
@@ -145,7 +139,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VCS4] = {
 		.hw_id = VCS4_HW,
-		.uabi_id = I915_EXEC_BSD,
 		.class = VIDEO_DECODE_CLASS,
 		.instance = 3,
 		.mmio_bases = {
@@ -154,7 +147,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VECS] = {
 		.hw_id = VECS_HW,
-		.uabi_id = I915_EXEC_VEBOX,
 		.class = VIDEO_ENHANCEMENT_CLASS,
 		.instance = 0,
 		.mmio_bases = {
@@ -164,7 +156,6 @@ static const struct engine_info intel_engines[] = {
 	},
 	[VECS2] = {
 		.hw_id = VECS2_HW,
-		.uabi_id = I915_EXEC_VEBOX,
 		.class = VIDEO_ENHANCEMENT_CLASS,
 		.instance = 1,
 		.mmio_bases = {
@@ -321,7 +312,6 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	engine->class = info->class;
 	engine->instance = info->instance;
 
-	engine->uabi_id = info->uabi_id;
 	engine->uabi_class = intel_engine_classes[info->class].uabi_class;
 
 	engine->context_size = __intel_engine_context_size(dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 9ca8f5ff5dd4..f37c5d1468be 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -355,7 +355,6 @@ struct intel_engine_cs {
 	unsigned int hw_id;
 	unsigned int guc_id;
 
-	u8 uabi_id;
 	u8 uabi_class;
 
 	u8 class;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index ca25ca28d75f..9db459b19d4e 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -1145,32 +1145,34 @@ struct drm_i915_gem_busy {
 	 * as busy may become idle before the ioctl is completed.
 	 *
 	 * Furthermore, if the object is busy, which engine is busy is only
-	 * provided as a guide. There are race conditions which prevent the
-	 * report of which engines are busy from being always accurate.
-	 * However, the converse is not true. If the object is idle, the
-	 * result of the ioctl, that all engines are idle, is accurate.
+	 * provided as a guide and only indirectly by reporting its class
+	 * (there may be more than one engine in each class). There are race
+	 * conditions which prevent the report of which engines are busy from
+	 * being always accurate.  However, the converse is not true. If the
+	 * object is idle, the result of the ioctl, that all engines are idle,
+	 * is accurate.
 	 *
 	 * The returned dword is split into two fields to indicate both
-	 * the engines on which the object is being read, and the
-	 * engine on which it is currently being written (if any).
+	 * the engine classess on which the object is being read, and the
+	 * engine class on which it is currently being written (if any).
 	 *
 	 * The low word (bits 0:15) indicate if the object is being written
 	 * to by any engine (there can only be one, as the GEM implicit
 	 * synchronisation rules force writes to be serialised). Only the
-	 * engine for the last write is reported.
+	 * engine class (offset by 1, I915_ENGINE_CLASS_RENDER is reported as
+	 * 1 not 0 etc) for the last write is reported.
 	 *
-	 * The high word (bits 16:31) are a bitmask of which engines are
-	 * currently reading from the object. Multiple engines may be
+	 * The high word (bits 16:31) are a bitmask of which engines classes
+	 * are currently reading from the object. Multiple engines may be
 	 * reading from the object simultaneously.
 	 *
-	 * The value of each engine is the same as specified in the
-	 * EXECBUFFER2 ioctl, i.e. I915_EXEC_RENDER, I915_EXEC_BSD etc.
-	 * Note I915_EXEC_DEFAULT is a symbolic value and is mapped to
-	 * the I915_EXEC_RENDER engine for execution, and so it is never
+	 * The value of each engine class is the same as specified in the
+	 * I915_CONTEXT_SET_ENGINES parameter and via perf, i.e.
+	 * I915_ENGINE_CLASS_RENDER, I915_ENGINE_CLASS_COPY, etc.
 	 * reported as active itself. Some hardware may have parallel
 	 * execution engines, e.g. multiple media engines, which are
-	 * mapped to the same identifier in the EXECBUFFER2 ioctl and
-	 * so are not separately reported for busyness.
+	 * mapped to the same class identifier and so are not separately
+	 * reported for busyness.
 	 *
 	 * Caveat emptor:
 	 * Only the boolean result of this query is reliable; that is whether
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 34/38] drm/i915: Re-arrange execbuf so context is known before engine
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (32 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 33/38] drm/i915: Remove last traces of exec-id (GEM_BUSY) Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 35/38] drm/i915: Allow a context to define its set of engines Chris Wilson
                   ` (5 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Needed for a following patch.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index f250109e1f66..e63c179f3b3d 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2238,10 +2238,6 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (args->flags & I915_EXEC_IS_PINNED)
 		eb.batch_flags |= I915_DISPATCH_PINNED;
 
-	eb.engine = eb_select_engine(eb.i915, file, args);
-	if (!eb.engine)
-		return -EINVAL;
-
 	if (args->flags & I915_EXEC_FENCE_IN) {
 		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
 		if (!in_fence)
@@ -2266,6 +2262,12 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (unlikely(err))
 		goto err_destroy;
 
+	eb.engine = eb_select_engine(eb.i915, file, args);
+	if (!eb.engine) {
+		err = -EINVAL;
+		goto err_engine;
+	}
+
 	/*
 	 * Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
@@ -2426,6 +2428,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	mutex_unlock(&dev->struct_mutex);
 err_rpm:
 	intel_runtime_pm_put(eb.i915, wakeref);
+err_engine:
 	i915_gem_context_put(eb.ctx);
 err_destroy:
 	eb_destroy(&eb);
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 35/38] drm/i915: Allow a context to define its set of engines
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (33 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 34/38] drm/i915: Re-arrange execbuf so context is known before engine Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 36/38] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
                   ` (4 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

Over the last few years, we have debated how to extend the user API to
support an increase in the number of engines, that may be sparse and
even be heterogeneous within a class (not all video decoders created
equal). We settled on using (class, instance) tuples to identify a
specific engine, with an API for the user to construct a map of engines
to capabilities. Into this picture, we then add a challenge of virtual
engines; one user engine that maps behind the scenes to any number of
physical engines. To keep it general, we want the user to have full
control over that mapping. To that end, we allow the user to constrain a
context to define the set of engines that it can access, order fully
controlled by the user via (class, instance). With such precise control
in context setup, we can continue to use the existing execbuf uABI of
specifying a single index; only now it doesn't automagically map onto
the engines, it uses the user defined engine map from the context.

The I915_EXEC_DEFAULT slot is left empty, and invalid for use by
execbuf. Its use will be revealed in the next patch.
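
As a hedged userspace sketch (not part of this patch; the context id, fd
handling and libdrm usage are assumptions), binding a context to an
explicit engine map might look as follows, leaving slot 0 invalid so it
can later be filled by a virtual engine:

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

static int bind_engine_map(int fd, uint32_t ctx_id)
{
	struct {
		struct i915_context_param_engines base;
		struct {
			uint16_t engine_class;
			uint16_t engine_instance;
		} engines[3];
	} map;
	struct drm_i915_gem_context_param p;

	memset(&map, 0, sizeof(map));
	/* slot 0: a gap, reserved for e.g. a load-balancing virtual engine */
	map.engines[0].engine_class = (uint16_t)I915_ENGINE_CLASS_INVALID;
	map.engines[0].engine_instance = (uint16_t)I915_ENGINE_CLASS_INVALID_NONE;
	/* slots 1 and 2: the two video decode engines */
	map.engines[1].engine_class = I915_ENGINE_CLASS_VIDEO;
	map.engines[1].engine_instance = 0;
	map.engines[2].engine_class = I915_ENGINE_CLASS_VIDEO;
	map.engines[2].engine_instance = 1;

	memset(&p, 0, sizeof(p));
	p.ctx_id = ctx_id;
	p.param = I915_CONTEXT_PARAM_ENGINES;
	p.size = sizeof(map.base) + sizeof(map.engines);
	p.value = (uintptr_t)&map;

	/* afterwards, execbuf ring index 1 selects vcs0 and index 2 vcs1 */
	return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}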

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_context.c    | 158 +++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_context.h    |   4 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  22 ++-
 include/uapi/drm/i915_drm.h                |  30 ++++
 4 files changed, 208 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index e28be242399d..701af326120d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -225,6 +225,8 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 			ce->ops->destroy(ce);
 	}
 
+	kfree(ctx->engines);
+
 	if (ctx->timeline)
 		i915_timeline_put(ctx->timeline);
 
@@ -907,6 +909,155 @@ static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
 	return err;
 }
 
+struct set_engines {
+	struct i915_gem_context *ctx;
+	struct intel_engine_cs **engines;
+	unsigned int nengine;
+};
+
+static const i915_user_extension_fn set_engines__extensions[] = {
+};
+
+static int set_engines(struct i915_gem_context *ctx,
+		       const struct drm_i915_gem_context_param *args)
+{
+	struct i915_context_param_engines __user *user;
+	struct set_engines set = { .ctx = ctx };
+	u64 size, extensions;
+	unsigned int n;
+	int err;
+
+	user = u64_to_user_ptr(args->value);
+	size = args->size;
+	if (!size)
+		goto out;
+
+	if (size < sizeof(*user))
+		return -EINVAL;
+
+	size -= sizeof(*user);
+	if (size % sizeof(*user->class_instance))
+		return -EINVAL;
+
+	set.nengine = size / sizeof(*user->class_instance);
+	if (set.nengine == 0 || set.nengine > I915_EXEC_RING_MASK)
+		return -EINVAL;
+
+	set.engines = kmalloc_array(set.nengine,
+				    sizeof(*set.engines),
+				    GFP_KERNEL);
+	if (!set.engines)
+		return -ENOMEM;
+
+	for (n = 0; n < set.nengine; n++) {
+		u16 class, inst;
+
+		if (get_user(class, &user->class_instance[n].engine_class) ||
+		    get_user(inst, &user->class_instance[n].engine_instance)) {
+			kfree(set.engines);
+			return -EFAULT;
+		}
+
+		if (class == I915_ENGINE_CLASS_INVALID &&
+		    inst == I915_ENGINE_CLASS_INVALID_NONE) {
+			set.engines[n] = NULL;
+			continue;
+		}
+
+		set.engines[n] =
+			intel_engine_lookup_user(ctx->i915, class, inst);
+		if (!set.engines[n]) {
+			kfree(set.engines);
+			return -ENOENT;
+		}
+	}
+
+	err = -EFAULT;
+	if (!get_user(extensions, &user->extensions))
+		err = i915_user_extensions(u64_to_user_ptr(extensions),
+					   set_engines__extensions,
+					   ARRAY_SIZE(set_engines__extensions),
+					   &set);
+	if (err) {
+		kfree(set.engines);
+		return err;
+	}
+
+out:
+	mutex_lock(&ctx->i915->drm.struct_mutex);
+	kfree(ctx->engines);
+	ctx->engines = set.engines;
+	ctx->nengine = set.nengine;
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	return 0;
+}
+
+static int get_engines(struct i915_gem_context *ctx,
+		       struct drm_i915_gem_context_param *args)
+{
+	struct i915_context_param_engines *local;
+	unsigned int n, count, size;
+	int err;
+
+restart:
+	count = READ_ONCE(ctx->nengine);
+	if (count > (INT_MAX - sizeof(*local)) / sizeof(*local->class_instance))
+		return -ENOMEM; /* unrepresentable! */
+
+	size = sizeof(*local) + count * sizeof(*local->class_instance);
+	if (!args->size) {
+		args->size = size;
+		return 0;
+	}
+	if (args->size < size)
+		return -EINVAL;
+
+	local = kmalloc(size, GFP_KERNEL);
+	if (!local)
+		return -ENOMEM;
+
+	if (mutex_lock_interruptible(&ctx->i915->drm.struct_mutex)) {
+		err = -EINTR;
+		goto err;
+	}
+
+	if (READ_ONCE(ctx->nengine) != count) {
+		mutex_unlock(&ctx->i915->drm.struct_mutex);
+		kfree(local);
+		goto restart;
+	}
+
+	local->extensions = 0;
+	for (n = 0; n < count; n++) {
+		if (ctx->engines[n]) {
+			local->class_instance[n].engine_class =
+				ctx->engines[n]->uabi_class;
+			local->class_instance[n].engine_instance =
+				ctx->engines[n]->instance;
+		} else {
+			local->class_instance[n].engine_class =
+				I915_ENGINE_CLASS_INVALID;
+			local->class_instance[n].engine_instance =
+				I915_ENGINE_CLASS_INVALID_NONE;
+		}
+	}
+
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	if (copy_to_user(u64_to_user_ptr(args->value), local, size)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	args->size = size;
+	return 0;
+
+err:
+	kfree(local);
+	return err;
+}
+
 static int ctx_setparam(struct i915_gem_context *ctx,
 			const struct drm_i915_gem_context_param *args)
 {
@@ -971,6 +1122,10 @@ static int ctx_setparam(struct i915_gem_context *ctx,
 			ret = set_ppgtt(ctx, lower_32_bits(args->value));
 		break;
 
+	case I915_CONTEXT_PARAM_ENGINES:
+		ret = set_engines(ctx, args);
+		break;
+
 	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
@@ -1121,6 +1276,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_VM:
 		ret = get_ppgtt(ctx, &args->value);
 		break;
+	case I915_CONTEXT_PARAM_ENGINES:
+		ret = get_engines(ctx, args);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index b3a840747330..635f693994c4 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -67,6 +67,8 @@ struct i915_gem_context {
 	/** file_priv: owning file descriptor */
 	struct drm_i915_file_private *file_priv;
 
+	struct intel_engine_cs **engines;
+
 	struct i915_timeline *timeline;
 
 	/**
@@ -135,6 +137,8 @@ struct i915_gem_context {
 #define CONTEXT_CLOSED			1
 #define CONTEXT_FORCE_SINGLE_SUBMISSION	2
 
+	unsigned int nengine;
+
 	/**
 	 * @hw_id: - unique identifier for the context
 	 *
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e63c179f3b3d..17187519e8f3 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -2016,13 +2016,23 @@ static const enum intel_engine_id user_ring_map[I915_USER_RINGS + 1] = {
 };
 
 static struct intel_engine_cs *
-eb_select_engine(struct drm_i915_private *dev_priv,
+eb_select_engine(struct i915_execbuffer *eb,
 		 struct drm_file *file,
 		 struct drm_i915_gem_execbuffer2 *args)
 {
 	unsigned int user_ring_id = args->flags & I915_EXEC_RING_MASK;
 	struct intel_engine_cs *engine;
 
+	if (eb->ctx->engines) {
+		if (user_ring_id >= eb->ctx->nengine) {
+			DRM_DEBUG("execbuf with unknown ring: %u\n",
+				  user_ring_id);
+			return NULL;
+		}
+
+		return eb->ctx->engines[user_ring_id];
+	}
+
 	if (user_ring_id > I915_USER_RINGS) {
 		DRM_DEBUG("execbuf with unknown ring: %u\n", user_ring_id);
 		return NULL;
@@ -2035,11 +2045,11 @@ eb_select_engine(struct drm_i915_private *dev_priv,
 		return NULL;
 	}
 
-	if (user_ring_id == I915_EXEC_BSD && HAS_BSD2(dev_priv)) {
+	if (user_ring_id == I915_EXEC_BSD && HAS_BSD2(eb->i915)) {
 		unsigned int bsd_idx = args->flags & I915_EXEC_BSD_MASK;
 
 		if (bsd_idx == I915_EXEC_BSD_DEFAULT) {
-			bsd_idx = gen8_dispatch_bsd_engine(dev_priv, file);
+			bsd_idx = gen8_dispatch_bsd_engine(eb->i915, file);
 		} else if (bsd_idx >= I915_EXEC_BSD_RING1 &&
 			   bsd_idx <= I915_EXEC_BSD_RING2) {
 			bsd_idx >>= I915_EXEC_BSD_SHIFT;
@@ -2050,9 +2060,9 @@ eb_select_engine(struct drm_i915_private *dev_priv,
 			return NULL;
 		}
 
-		engine = dev_priv->engine[_VCS(bsd_idx)];
+		engine = eb->i915->engine[_VCS(bsd_idx)];
 	} else {
-		engine = dev_priv->engine[user_ring_map[user_ring_id]];
+		engine = eb->i915->engine[user_ring_map[user_ring_id]];
 	}
 
 	if (!engine) {
@@ -2262,7 +2272,7 @@ i915_gem_do_execbuffer(struct drm_device *dev,
 	if (unlikely(err))
 		goto err_destroy;
 
-	eb.engine = eb_select_engine(eb.i915, file, args);
+	eb.engine = eb_select_engine(&eb, file, args);
 	if (!eb.engine) {
 		err = -EINVAL;
 		goto err_engine;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 9db459b19d4e..84f42317de92 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -122,6 +122,8 @@ enum drm_i915_gem_engine_class {
 	I915_ENGINE_CLASS_INVALID	= -1
 };
 
+#define I915_ENGINE_CLASS_INVALID_NONE -1
+
 /**
  * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
  *
@@ -1464,9 +1466,37 @@ struct drm_i915_gem_context_param {
 #define   I915_CONTEXT_DEFAULT_PRIORITY		0
 #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
 #define I915_CONTEXT_PARAM_VM		0x7
+
+/*
+ * I915_CONTEXT_PARAM_ENGINES:
+ *
+ * Bind this context to operate on this subset of available engines. Henceforth,
+ * the I915_EXEC_RING selector for DRM_IOCTL_I915_GEM_EXECBUFFER2 operates as
+ * an index into this array of engines; I915_EXEC_DEFAULT selecting engine[0]
+ * and upwards. Slots 0...N are filled in using the specified (class, instance).
+ * Use (I915_ENGINE_CLASS_INVALID, I915_ENGINE_CLASS_INVALID_NONE) to specify
+ * a gap in the array that can be filled in later, e.g. by a virtual engine
+ * used for load balancing.
+ *
+ * Setting the number of engines bound to the context to 0, by passing a zero
+ * sized argument, will revert back to default settings.
+ *
+ * See struct i915_context_param_engines.
+ */
+#define I915_CONTEXT_PARAM_ENGINES	0x8
+
 	__u64 value;
 };
 
+struct i915_context_param_engines {
+	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
+
+	struct {
+		__u16 engine_class; /* see enum drm_i915_gem_engine_class */
+		__u16 engine_instance;
+	} class_instance[0];
+};
+
 struct drm_i915_gem_context_create_ext_setparam {
 #define I915_CONTEXT_CREATE_EXT_SETPARAM 0
 	struct i915_user_extension base;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 36/38] drm/i915/execlists: Refactor out can_merge_rq()
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (34 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 35/38] drm/i915: Allow a context to define its set of engines Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 37/38] drm/i915: Store the BIT(engine->id) as the engine's mask Chris Wilson
                   ` (3 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

In the next patch, we add another user that wants to check whether
requests can be merged into a single HW execution, and in the future we
want to add more conditions under which requests from the same context
cannot be merged. In preparation, extract out can_merge_rq().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 10c42820bb46..e365c12c08eb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -484,6 +484,15 @@ static bool can_merge_ctx(const struct intel_context *prev,
 	return true;
 }
 
+static bool can_merge_rq(const struct i915_request *prev,
+			 const struct i915_request *next)
+{
+	if (!can_merge_ctx(prev->hw_context, next->hw_context))
+		return false;
+
+	return true;
+}
+
 static void port_assign(struct execlist_port *port, struct i915_request *rq)
 {
 	GEM_BUG_ON(rq == port_request(port));
@@ -648,8 +657,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			 * second request, and so we never need to tell the
 			 * hardware about the first.
 			 */
-			if (last &&
-			    !can_merge_ctx(rq->hw_context, last->hw_context)) {
+			if (last && !can_merge_rq(rq, last)) {
 				/*
 				 * If we are on the second port and cannot
 				 * combine this request with the last, then we
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 37/38] drm/i915: Store the BIT(engine->id) as the engine's mask
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (35 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 36/38] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:01 ` [PATCH 38/38] drm/i915: Load balancing across a virtual engine Chris Wilson
                   ` (2 subsequent siblings)
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

In the next patch, we are introducing a broad virtual engine to encompass
multiple physical engines, losing the 1:1 nature of BIT(engine->id). To
reflect the broader set of engines implied by the virtual instance, let's
store the full bitmask.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_request.c              | 2 +-
 drivers/gpu/drm/i915/i915_reset.c                | 4 ++--
 drivers/gpu/drm/i915/intel_engine_cs.c           | 1 +
 drivers/gpu/drm/i915/intel_hangcheck.c           | 8 ++++----
 drivers/gpu/drm/i915/intel_ringbuffer.c          | 4 ++--
 drivers/gpu/drm/i915/intel_ringbuffer.h          | 7 +------
 drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 2 +-
 7 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index ca432d3d8211..22ac02f02b33 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -888,7 +888,7 @@ void i915_request_add(struct i915_request *request)
 		list_add(&ring->active_link, &request->i915->gt.active_rings);
 	}
 	pin_active_context(request->i915, request->hw_context);
-	request->i915->gt.active_engines |= BIT(engine->id);
+	request->i915->gt.active_engines |= engine->mask;
 
 	request->emitted_jiffies = jiffies;
 
diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
index e0d87e287b10..660af00329b4 100644
--- a/drivers/gpu/drm/i915/i915_reset.c
+++ b/drivers/gpu/drm/i915/i915_reset.c
@@ -1041,7 +1041,7 @@ void i915_reset(struct drm_i915_private *i915,
 static inline int intel_gt_reset_engine(struct drm_i915_private *i915,
 					struct intel_engine_cs *engine)
 {
-	return intel_gpu_reset(i915, intel_engine_flag(engine));
+	return intel_gpu_reset(i915, engine->mask);
 }
 
 /**
@@ -1239,7 +1239,7 @@ void i915_handle_error(struct drm_i915_private *i915,
 				continue;
 
 			if (i915_reset_engine(engine, msg) == 0)
-				engine_mask &= ~intel_engine_flag(engine);
+				engine_mask &= ~engine->mask;
 
 			clear_bit(I915_RESET_ENGINE + engine->id,
 				  &i915->gpu_error.flags);
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 53f68c983334..41deffa3c69c 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -305,6 +305,7 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 		return -ENOMEM;
 
 	engine->id = id;
+	engine->mask = BIT(id);
 	engine->i915 = dev_priv;
 	__sprint_engine_name(engine->name, info);
 	engine->hw_id = engine->guc_id = info->hw_id;
diff --git a/drivers/gpu/drm/i915/intel_hangcheck.c b/drivers/gpu/drm/i915/intel_hangcheck.c
index 2d0ef22518b1..2bc3003ffe6f 100644
--- a/drivers/gpu/drm/i915/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/intel_hangcheck.c
@@ -120,7 +120,7 @@ engine_stuck(struct intel_engine_cs *engine, u64 acthd)
 	 */
 	tmp = I915_READ_CTL(engine);
 	if (tmp & RING_WAIT) {
-		i915_handle_error(dev_priv, BIT(engine->id), 0,
+		i915_handle_error(dev_priv, engine->mask, 0,
 				  "stuck wait on %s", engine->name);
 		I915_WRITE_CTL(engine, tmp);
 		return ENGINE_WAIT_KICK;
@@ -286,13 +286,13 @@ static void i915_hangcheck_elapsed(struct work_struct *work)
 		hangcheck_store_sample(engine, &hc);
 
 		if (hc.stalled) {
-			hung |= intel_engine_flag(engine);
+			hung |= engine->mask;
 			if (hc.action != ENGINE_DEAD)
-				stuck |= intel_engine_flag(engine);
+				stuck |= engine->mask;
 		}
 
 		if (hc.wedged)
-			wedged |= intel_engine_flag(engine);
+			wedged |= engine->mask;
 	}
 
 	if (wedged) {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index e2c415aa8354..07379c02af83 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1841,8 +1841,8 @@ static int switch_context(struct i915_request *rq)
 				goto err;
 		} while (--loops);
 
-		if (intel_engine_flag(engine) & ppgtt->pd_dirty_rings) {
-			unwind_mm = intel_engine_flag(engine);
+		if (ppgtt->pd_dirty_rings & engine->mask) {
+			unwind_mm = engine->mask;
 			ppgtt->pd_dirty_rings &= ~unwind_mm;
 			hw_flags = MI_FORCE_RESTORE;
 		}
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index f37c5d1468be..c1afe77260d0 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -354,6 +354,7 @@ struct intel_engine_cs {
 	enum intel_engine_id id;
 	unsigned int hw_id;
 	unsigned int guc_id;
+	unsigned long mask;
 
 	u8 uabi_class;
 
@@ -654,12 +655,6 @@ execlists_port_complete(struct intel_engine_execlists * const execlists,
 	return port;
 }
 
-static inline unsigned int
-intel_engine_flag(const struct intel_engine_cs *engine)
-{
-	return BIT(engine->id);
-}
-
 static inline u32
 intel_read_status_page(const struct intel_engine_cs *engine, int reg)
 {
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 9fe9ba66b5ec..625d5b6055be 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -1137,7 +1137,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
 	}
 
 out_reset:
-	fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
+	fake_hangcheck(rq->i915, rq->engine->mask);
 
 	if (tsk) {
 		struct igt_wedge_me w;
-- 
2.20.1


^ permalink raw reply related	[flat|nested] 66+ messages in thread

* [PATCH 38/38] drm/i915: Load balancing across a virtual engine
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (36 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 37/38] drm/i915: Store the BIT(engine->id) as the engine's mask Chris Wilson
@ 2019-01-18 14:01 ` Chris Wilson
  2019-01-18 14:17 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context Patchwork
  2019-01-24 16:28 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context (rev2) Patchwork
  39 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 14:01 UTC (permalink / raw)
  To: intel-gfx

Having allowed the user to define a set of engines that they want to
use, we go one step further and allow them to bind those engines
into a single virtual instance. Submitting a batch to the virtual engine
will then forward it to any one of the set in a manner as best to
distribute load.  The virtual engine has a single timeline across all
engines (it operates as a single queue), so it is not able to concurrently
run batches across multiple engines by itself; that is left up to the user
to submit multiple concurrent batches to multiple queues. Multiple users
will be load balanced across the system.

The mechanism used for load balancing in this patch is a late greedy
balancer. When a request is ready for execution, it is added to each
engine's queue, and when an engine is ready for its next request it
claims it from the virtual engine. The first engine to do so wins, i.e.
the request is executed at the earliest opportunity (idle moment) in the
system.
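
The claim step can be pictured with a small standalone toy (purely
illustrative, not the driver code): the virtual engine is one locked
queue, and whichever idle sibling reaches it first takes the next
request.

#include <pthread.h>
#include <stddef.h>

struct toy_request {
	struct toy_request *next;
	int payload;
};

struct toy_virtual_engine {
	pthread_mutex_t lock;
	struct toy_request *head;
};

/* Called by each physical sibling as it goes idle: first one wins. */
static struct toy_request *toy_claim(struct toy_virtual_engine *ve)
{
	struct toy_request *rq;

	pthread_mutex_lock(&ve->lock);
	rq = ve->head;
	if (rq)
		ve->head = rq->next;
	pthread_mutex_unlock(&ve->lock);

	return rq; /* NULL means the virtual queue has drained */
}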

As not all HW is created equal, the user is still able to skip the
virtual engine and execute the batch on a specific engine, all within the
same queue. It will then be executed in order on the correct engine,
with execution on other virtual engines being moved away due to the load
detection.

A couple of areas for potential improvement left!

- The virtual engine always takes priority over equal-priority tasks.
Mostly broken up by applying FQ_CODEL rules for prioritising new clients,
and hopefully the virtual and real engines are not then congested (i.e.
all work is via virtual engines, or all work is to the real engine).

- We require the breadcrumb irq around every virtual engine request. For
normal engines, we eliminate the need for the slow round trip via
interrupt by using the submit fence and queueing in order. For virtual
engines, we have to allow any job to transfer to a new ring, and cannot
coalesce the submissions, so require the completion fence instead,
forcing the persistent use of interrupts.

Other areas of improvement are more general, such as reducing lock
contention, reducing dispatch overhead, looking at direct submission
rather than bouncing around tasklets etc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/Makefile                 |   1 +
 drivers/gpu/drm/i915/i915_gem.h               |   5 +
 drivers/gpu/drm/i915/i915_gem_context.c       |  95 +++-
 drivers/gpu/drm/i915/i915_gem_context.h       |  14 +
 drivers/gpu/drm/i915/i915_request.c           |   2 +-
 drivers/gpu/drm/i915/i915_scheduler.c         |   1 +
 drivers/gpu/drm/i915/intel_lrc.c              | 453 +++++++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.h              |   6 +
 drivers/gpu/drm/i915/intel_ringbuffer.h       |   8 +
 .../gpu/drm/i915/selftests/i915_gem_context.c | 102 +---
 drivers/gpu/drm/i915/selftests/i915_request.c |  85 +---
 .../gpu/drm/i915/selftests/igt_live_test.c    |  78 +++
 .../gpu/drm/i915/selftests/igt_live_test.h    |  29 ++
 drivers/gpu/drm/i915/selftests/intel_lrc.c    | 209 ++++++++
 drivers/gpu/drm/i915/selftests/mock_context.c |  10 +-
 include/uapi/drm/i915_drm.h                   |  27 ++
 16 files changed, 938 insertions(+), 187 deletions(-)
 create mode 100644 drivers/gpu/drm/i915/selftests/igt_live_test.c
 create mode 100644 drivers/gpu/drm/i915/selftests/igt_live_test.h

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index f206fbc85cee..ebc690897009 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -168,6 +168,7 @@ i915-$(CONFIG_DRM_I915_SELFTEST) += \
 	selftests/i915_random.o \
 	selftests/i915_selftest.o \
 	selftests/igt_flush_test.o \
+	selftests/igt_live_test.o \
 	selftests/igt_reset.o \
 	selftests/igt_spinner.o
 
diff --git a/drivers/gpu/drm/i915/i915_gem.h b/drivers/gpu/drm/i915/i915_gem.h
index b0e4b976880c..9905fcdd33c8 100644
--- a/drivers/gpu/drm/i915/i915_gem.h
+++ b/drivers/gpu/drm/i915/i915_gem.h
@@ -89,4 +89,9 @@ static inline bool __tasklet_is_enabled(const struct tasklet_struct *t)
 	return !atomic_read(&t->count);
 }
 
+static inline bool __tasklet_is_scheduled(struct tasklet_struct *t)
+{
+	return test_bit(TASKLET_STATE_SCHED, &t->state);
+}
+
 #endif /* __I915_GEM_H__ */
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 701af326120d..dbd5c4299246 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -92,6 +92,7 @@
 #include "i915_drv.h"
 #include "i915_trace.h"
 #include "i915_user_extensions.h"
+#include "intel_lrc.h"
 #include "intel_workarounds.h"
 
 #define ALL_L3_SLICES(dev) (1 << NUM_L3_SLICES(dev)) - 1
@@ -225,7 +226,10 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
 			ce->ops->destroy(ce);
 	}
 
-	kfree(ctx->engines);
+	if (ctx->engines) {
+		intel_virtual_engine_put(ctx->engines[0]);
+		kfree(ctx->engines);
+	}
 
 	if (ctx->timeline)
 		i915_timeline_put(ctx->timeline);
@@ -348,14 +352,8 @@ __create_hw_context(struct drm_i915_private *dev_priv,
 	ctx->i915 = dev_priv;
 	ctx->sched.priority = I915_USER_PRIORITY(I915_PRIORITY_NORMAL);
 
-	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
-		struct intel_context *ce = &ctx->__engine[n];
-
-		ce->gem_context = ctx;
-		INIT_LIST_HEAD(&ce->active_link);
-		INIT_LIST_HEAD(&ce->signal_link);
-		INIT_LIST_HEAD(&ce->signals);
-	}
+	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++)
+		intel_context_init(&ctx->__engine[n], ctx, dev_priv->engine[n]);
 
 	INIT_RADIX_TREE(&ctx->handles_vma, GFP_KERNEL);
 	INIT_LIST_HEAD(&ctx->handles_list);
@@ -764,7 +762,8 @@ last_request_on_engine(struct i915_timeline *timeline,
 
 	rq = i915_gem_active_raw(&timeline->last_request,
 				 &engine->i915->drm.struct_mutex);
-	if (rq && rq->engine == engine) {
+	if (rq &&
+	    (rq->engine == engine || intel_engine_is_virtual(rq->engine))) {
 		GEM_TRACE("last request for %s on engine %s: %llx:%llu\n",
 			  timeline->name, engine->name,
 			  rq->fence.context, rq->fence.seqno);
@@ -909,13 +908,83 @@ static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
 	return err;
 }
 
+static int check_user_mbz64(u64 __user *user)
+{
+	u64 mbz;
+
+	if (get_user(mbz, user))
+		return -EFAULT;
+
+	return mbz ? -EINVAL : 0;
+}
+
 struct set_engines {
 	struct i915_gem_context *ctx;
 	struct intel_engine_cs **engines;
 	unsigned int nengine;
 };
 
+static int set_engines__load_balance(struct i915_user_extension __user *base,
+				     void *data)
+
+{
+	struct i915_context_engines_load_balance __user *ext =
+		container_of(base, typeof(*ext) __user, base);
+	const struct set_engines *set = data;
+	struct intel_engine_cs *ve;
+	unsigned int n;
+	u64 mask;
+	int err;
+
+	if (set->engines[0])
+		return -EEXIST;
+
+	if (!HAS_EXECLISTS(set->ctx->i915))
+		return -ENODEV;
+
+	if (USES_GUC_SUBMISSION(set->ctx->i915))
+		return -ENODEV;
+
+	err = check_user_mbz64(&ext->flags);
+	if (err)
+		return err;
+
+	for (n = 0; n < ARRAY_SIZE(ext->mbz); n++) {
+		err = check_user_mbz64(&ext->mbz[n]);
+		if (err)
+			return err;
+	}
+
+	if (get_user(mask, &ext->engines_mask))
+		return -EFAULT;
+
+	if (mask == ~0ull) {
+		ve = intel_execlists_create_virtual(set->ctx,
+						    set->engines + 1,
+						    set->nengine - 1);
+	} else {
+		struct intel_engine_cs *stack[64];
+		int bit;
+
+		n = 0;
+		for_each_set_bit(bit, (unsigned long *)&mask, set->nengine)
+			stack[n++] = set->engines[bit];
+
+		ve = intel_execlists_create_virtual(set->ctx, stack, n);
+	}
+	if (IS_ERR(ve))
+		return PTR_ERR(ve);
+
+	if (cmpxchg(&set->engines[0], NULL, ve)) {
+		intel_virtual_engine_put(ve);
+		return -EEXIST;
+	}
+
+	return 0;
+}
+
 static const i915_user_extension_fn set_engines__extensions[] = {
+	[I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE] = set_engines__load_balance,
 };
 
 static int set_engines(struct i915_gem_context *ctx,
@@ -979,13 +1048,17 @@ static int set_engines(struct i915_gem_context *ctx,
 					   ARRAY_SIZE(set_engines__extensions),
 					   &set);
 	if (err) {
+		intel_virtual_engine_put(set.engines[0]);
 		kfree(set.engines);
 		return err;
 	}
 
 out:
 	mutex_lock(&ctx->i915->drm.struct_mutex);
-	kfree(ctx->engines);
+	if (ctx->engines) {
+		intel_virtual_engine_put(ctx->engines[0]);
+		kfree(ctx->engines);
+	}
 	ctx->engines = set.engines;
 	ctx->nengine = set.nengine;
 	mutex_unlock(&ctx->i915->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 635f693994c4..2aeca60674ad 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -171,6 +171,7 @@ struct i915_gem_context {
 	struct intel_context {
 		struct i915_gem_context *gem_context;
 		struct intel_engine_cs *active;
+		struct intel_engine_cs *owner;
 		struct list_head active_link;
 		struct list_head signal_link;
 		struct list_head signals;
@@ -386,4 +387,17 @@ static inline void i915_gem_context_put(struct i915_gem_context *ctx)
 	kref_put(&ctx->ref, i915_gem_context_release);
 }
 
+static inline void
+intel_context_init(struct intel_context *ce,
+		   struct i915_gem_context *ctx,
+		   struct intel_engine_cs *engine)
+{
+	ce->gem_context = ctx;
+	ce->owner = engine;
+
+	INIT_LIST_HEAD(&ce->active_link);
+	INIT_LIST_HEAD(&ce->signal_link);
+	INIT_LIST_HEAD(&ce->signals);
+}
+
 #endif /* !__I915_GEM_CONTEXT_H__ */
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 22ac02f02b33..6637e9b93a6a 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -860,7 +860,7 @@ void i915_request_add(struct i915_request *request)
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
 	if (prev && !i915_request_completed(prev)) {
-		if (prev->engine == engine)
+		if (prev->engine == engine && !intel_engine_is_virtual(engine))
 			i915_sw_fence_await_sw_fence(&request->submit,
 						     &prev->submit,
 						     &request->submitq);
diff --git a/drivers/gpu/drm/i915/i915_scheduler.c b/drivers/gpu/drm/i915/i915_scheduler.c
index 53e76a91ad75..c1d119753063 100644
--- a/drivers/gpu/drm/i915/i915_scheduler.c
+++ b/drivers/gpu/drm/i915/i915_scheduler.c
@@ -347,6 +347,7 @@ static void __i915_schedule(struct i915_request *rq,
 
 		node->attr.priority = prio;
 		if (!list_empty(&node->link)) {
+			GEM_BUG_ON(intel_engine_is_virtual(engine));
 			if (last != engine) {
 				pl = i915_sched_lookup_priolist(engine, prio);
 				last = engine;
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index e365c12c08eb..2d2803b76a12 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -164,6 +164,29 @@
 #define WA_TAIL_DWORDS 2
 #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
 
+struct virtual_engine {
+	struct intel_engine_cs base;
+
+	struct intel_context context;
+	struct kref kref;
+
+	struct intel_engine_cs *bound;
+
+	struct i915_request *request;
+	struct ve_node {
+		struct rb_node rb;
+		int prio;
+	} nodes[I915_NUM_ENGINES];
+
+	unsigned int count;
+	struct intel_engine_cs *siblings[0];
+};
+
+static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
+{
+	return container_of(engine, struct virtual_engine, base);
+}
+
 static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
 					    struct intel_engine_cs *engine,
 					    struct intel_context *ce);
@@ -283,6 +306,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	lockdep_assert_held(&engine->execution_lock);
 
 	list_for_each_entry_safe_reverse(rq, rn, &engine->requests, link) {
+		struct intel_engine_cs *owner;
+
 		if (i915_request_completed(rq))
 			break;
 
@@ -291,14 +316,20 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 
 		GEM_BUG_ON(rq->hw_context->active);
 
-		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
-		if (rq_prio(rq) != prio) {
-			prio = rq_prio(rq);
-			pl = i915_sched_lookup_priolist(engine, prio);
-		}
-		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
+		owner = rq->hw_context->owner;
+		if (likely(owner == engine)) {
+			GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
+			if (rq_prio(rq) != prio) {
+				prio = rq_prio(rq);
+				pl = i915_sched_lookup_priolist(engine, prio);
+			}
+			GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
 
-		list_add(&rq->sched.link, pl);
+			list_add(&rq->sched.link, pl);
+		} else {
+			rq->engine = owner;
+			owner->submit_request(rq);
+		}
 
 		active = rq;
 	}
@@ -308,7 +339,8 @@ __unwind_incomplete_requests(struct intel_engine_cs *engine)
 	 * stream, so give it the equivalent small priority bump to prevent
 	 * it being gazumped a second time by another peer.
 	 */
-	if (!(prio & I915_PRIORITY_NEWCLIENT)) {
+	if (!(prio & I915_PRIORITY_NEWCLIENT) &&
+	    active->hw_context->owner == engine) {
 		prio |= I915_PRIORITY_NEWCLIENT;
 		list_move_tail(&active->sched.link,
 			       i915_sched_lookup_priolist(engine, prio));
@@ -544,6 +576,50 @@ static void complete_preempt_context(struct intel_engine_execlists *execlists)
 						  execlists));
 }
 
+static void virtual_update_register_offsets(u32 *regs,
+					    struct intel_engine_cs *engine)
+{
+	u32 base = engine->mmio_base;
+
+	regs[CTX_CONTEXT_CONTROL] =
+		i915_mmio_reg_offset(RING_CONTEXT_CONTROL(engine));
+	regs[CTX_RING_HEAD] = i915_mmio_reg_offset(RING_HEAD(base));
+	regs[CTX_RING_TAIL] = i915_mmio_reg_offset(RING_TAIL(base));
+	regs[CTX_RING_BUFFER_START] = i915_mmio_reg_offset(RING_START(base));
+	regs[CTX_RING_BUFFER_CONTROL] = i915_mmio_reg_offset(RING_CTL(base));
+
+	regs[CTX_BB_HEAD_U] = i915_mmio_reg_offset(RING_BBADDR_UDW(base));
+	regs[CTX_BB_HEAD_L] = i915_mmio_reg_offset(RING_BBADDR(base));
+	regs[CTX_BB_STATE] = i915_mmio_reg_offset(RING_BBSTATE(base));
+	regs[CTX_SECOND_BB_HEAD_U] =
+		i915_mmio_reg_offset(RING_SBBADDR_UDW(base));
+	regs[CTX_SECOND_BB_HEAD_L] = i915_mmio_reg_offset(RING_SBBADDR(base));
+	regs[CTX_SECOND_BB_STATE] = i915_mmio_reg_offset(RING_SBBSTATE(base));
+
+	regs[CTX_CTX_TIMESTAMP] =
+		i915_mmio_reg_offset(RING_CTX_TIMESTAMP(base));
+	regs[CTX_PDP3_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 3));
+	regs[CTX_PDP3_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 3));
+	regs[CTX_PDP2_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 2));
+	regs[CTX_PDP2_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 2));
+	regs[CTX_PDP1_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 1));
+	regs[CTX_PDP1_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 1));
+	regs[CTX_PDP0_UDW] = i915_mmio_reg_offset(GEN8_RING_PDP_UDW(engine, 0));
+	regs[CTX_PDP0_LDW] = i915_mmio_reg_offset(GEN8_RING_PDP_LDW(engine, 0));
+
+	if (engine->class == RENDER_CLASS) {
+		regs[CTX_RCS_INDIRECT_CTX] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX(base));
+		regs[CTX_RCS_INDIRECT_CTX_OFFSET] =
+			i915_mmio_reg_offset(RING_INDIRECT_CTX_OFFSET(base));
+		regs[CTX_BB_PER_CTX_PTR] =
+			i915_mmio_reg_offset(RING_BB_PER_CTX_PTR(base));
+
+		regs[CTX_R_PWR_CLK_STATE] =
+			i915_mmio_reg_offset(GEN8_R_PWR_CLK_STATE);
+	}
+}
+
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct intel_engine_execlists * const execlists = &engine->execlists;
@@ -553,6 +629,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	struct i915_request *last = port_request(port);
 	struct rb_node *rb;
 	bool submit = false;
+	const struct intel_context *q_context;
+	int q_priority;
 
 	/*
 	 * Hardware submission is through 2 ports. Conceptually each port
@@ -576,6 +654,35 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+restart_virtual_engine:
+	q_context = execlists->queue_context;
+	q_priority = execlists->queue_priority;
+	for (rb = rb_first_cached(&execlists->virtual); rb; ) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+		struct intel_engine_cs *active;
+
+		if (!rq) {
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+			rb = rb_first_cached(&execlists->virtual);
+			continue;
+		}
+
+		active = READ_ONCE(ve->context.active);
+		if (active && active != engine) {
+			rb = rb_next(rb);
+			continue;
+		}
+
+		if (q_priority >= rq_prio(rq)) {
+			q_priority = rq_prio(rq);
+			q_context = rq->hw_context;
+		}
+		break;
+	}
+
 	if (last) {
 		/*
 		 * Don't resubmit or switch until all outstanding
@@ -597,10 +704,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		if (!execlists_is_active(execlists, EXECLISTS_ACTIVE_HWACK))
 			return;
 
-		if (need_preempt(engine, last,
-				 execlists->queue_priority,
-				 execlists->queue_context)) {
-			GEM_BUG_ON(execlists->queue_priority == INT_MIN);
+		if (need_preempt(engine, last, q_priority, q_context)) {
+			GEM_BUG_ON(q_priority == INT_MIN);
 			inject_preempt_context(engine);
 			return;
 		}
@@ -640,6 +745,67 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		last->tail = last->wa_tail;
 	}
 
+	if (rb) { /* XXX virtual is always taking precedence */
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq;
+
+		spin_lock(&ve->base.execution_lock);
+
+		rq = ve->request;
+		if (unlikely(!rq)) { /* lost the race to a sibling */
+			spin_unlock(&ve->base.execution_lock);
+			goto restart_virtual_engine;
+		}
+
+		if (rq_prio(rq) >= q_priority) {
+			if (last && !can_merge_rq(rq, last)) {
+				spin_unlock(&ve->base.execution_lock);
+				return;
+			}
+
+			GEM_BUG_ON(rq->engine != &ve->base);
+			ve->request = NULL;
+			ve->base.execlists.queue_priority = INT_MIN;
+			rb_erase_cached(rb, &execlists->virtual);
+			RB_CLEAR_NODE(rb);
+
+			GEM_BUG_ON(rq->hw_context != &ve->context);
+			rq->engine = engine;
+
+			if (engine != ve->bound) {
+				u32 *regs = ve->context.lrc_reg_state;
+				unsigned int n;
+
+				GEM_BUG_ON(READ_ONCE(ve->context.active));
+				virtual_update_register_offsets(regs, engine);
+				ve->bound = engine;
+
+				/*
+				 * Move the bound engine to the top of the list
+				 * for future execution. We then kick this
+				 * tasklet first before checking others, so that
+				 * we preferentially reuse this set of bound
+				 * registers.
+				 */
+				for (n = 1; n < ve->count; n++) {
+					if (ve->siblings[n] == engine) {
+						swap(ve->siblings[n],
+						     ve->siblings[0]);
+						break;
+					}
+				}
+			}
+
+			__i915_request_submit(rq);
+			trace_i915_request_in(rq, port_index(port, execlists));
+			submit = true;
+			last = rq;
+		}
+
+		spin_unlock(&ve->base.execution_lock);
+	}
+
 	while ((rb = rb_first_cached(&execlists->queue))) {
 		struct i915_priolist *p = to_priolist(rb);
 		struct i915_request *rq, *rn;
@@ -2723,6 +2889,246 @@ void intel_lr_context_resume(struct drm_i915_private *i915)
 	}
 }
 
+static void virtual_engine_free(struct kref *kref)
+{
+	struct virtual_engine *ve = container_of(kref, typeof(*ve), kref);
+	unsigned int n;
+
+	GEM_BUG_ON(ve->request);
+	GEM_BUG_ON(ve->context.active);
+
+	for (n = 0; n < ve->count; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct rb_node *node = &ve->nodes[sibling->id].rb;
+
+		if (RB_EMPTY_NODE(node))
+			continue;
+
+		spin_lock_irq(&sibling->execution_lock);
+
+		if (!RB_EMPTY_NODE(node))
+			rb_erase_cached(node, &sibling->execlists.virtual);
+
+		spin_unlock_irq(&sibling->execution_lock);
+	}
+	GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
+
+	if (ve->context.state)
+		execlists_context_destroy(&ve->context);
+
+	kfree(ve);
+}
+
+static void virtual_context_unpin(struct intel_context *ce)
+{
+	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
+
+	execlists_context_unpin(ce);
+
+	kref_put(&ve->kref, virtual_engine_free);
+}
+
+static const struct intel_context_ops virtual_context_ops = {
+	.unpin = virtual_context_unpin,
+};
+
+static struct intel_context *
+virtual_context_pin(struct intel_engine_cs *engine,
+		    struct i915_gem_context *ctx)
+{
+	struct virtual_engine *ve = to_virtual_engine(engine);
+	struct intel_context *ce = &ve->context;
+
+	lockdep_assert_held(&ctx->i915->drm.struct_mutex);
+
+	if (likely(ce->pin_count++))
+		return ce;
+	GEM_BUG_ON(!ce->pin_count); /* no overflow please! */
+
+	kref_get(&ve->kref);
+	ce->ops = &virtual_context_ops;
+
+	if (!ve->bound)
+		ve->bound = ve->siblings[0];
+
+	/* Note: we must use a real engine class for setting up reg state */
+	return __execlists_context_pin(ve->bound, ctx, ce);
+}
+
+static void virtual_submission_tasklet(unsigned long data)
+{
+	struct virtual_engine * const ve = (struct virtual_engine *)data;
+	unsigned int n;
+	int prio;
+
+	prio = READ_ONCE(ve->base.execlists.queue_priority);
+	if (prio == INT_MIN)
+		return;
+
+	local_irq_disable();
+	for (n = 0; READ_ONCE(ve->request) && n < ve->count; n++) {
+		struct intel_engine_cs *sibling = ve->siblings[n];
+		struct ve_node * const node = &ve->nodes[sibling->id];
+		struct rb_node **parent, *rb;
+		bool first;
+
+		spin_lock(&sibling->execution_lock);
+
+		if (!RB_EMPTY_NODE(&node->rb)) {
+			first = rb_first_cached(&sibling->execlists.virtual) == &node->rb;
+			if (prio == node->prio || (prio > node->prio && first))
+				goto submit_engine;
+
+			rb_erase_cached(&node->rb, &sibling->execlists.virtual);
+		}
+
+		rb = NULL;
+		first = true;
+		parent = &sibling->execlists.virtual.rb_root.rb_node;
+		while (*parent) {
+			struct ve_node *other;
+
+			rb = *parent;
+			other = rb_entry(rb, typeof(*other), rb);
+			if (prio > other->prio) {
+				parent = &rb->rb_left;
+			} else {
+				parent = &rb->rb_right;
+				first = false;
+			}
+		}
+
+		rb_link_node(&node->rb, rb, parent);
+		rb_insert_color_cached(&node->rb,
+				       &sibling->execlists.virtual,
+				       first);
+
+submit_engine:
+		GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
+		node->prio = prio;
+		if (first && prio > sibling->execlists.queue_priority)
+			tasklet_hi_schedule(&sibling->execlists.tasklet);
+
+		spin_unlock(&sibling->execution_lock);
+	}
+	local_irq_enable();
+}
+
+static void virtual_submit_request(struct i915_request *request)
+{
+	struct virtual_engine *ve = to_virtual_engine(request->engine);
+
+	GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
+
+	GEM_BUG_ON(ve->request);
+	ve->base.execlists.queue_priority = rq_prio(request);
+	WRITE_ONCE(ve->request, request);
+
+	tasklet_schedule(&ve->base.execlists.tasklet);
+}
+
+struct intel_engine_cs *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count)
+{
+	struct virtual_engine *ve;
+	unsigned int n;
+	int err;
+
+	if (!count)
+		return ERR_PTR(-EINVAL);
+
+	ve = kzalloc(sizeof(*ve) + count * sizeof(*ve->siblings), GFP_KERNEL);
+	if (!ve)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&ve->kref);
+	ve->base.i915 = ctx->i915;
+	ve->base.id = -1;
+	ve->base.class = OTHER_CLASS;
+	ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
+	ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
+	ve->base.flags = I915_ENGINE_IS_VIRTUAL;
+
+	snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
+
+	INIT_LIST_HEAD(&ve->base.requests);
+	spin_lock_init(&ve->base.execution_lock);
+	lockdep_set_subclass(&ve->base.execution_lock, EXECUTION_VIRTUAL);
+
+	intel_context_init(&ve->context, ctx, &ve->base);
+
+	ve->base.context_pin = virtual_context_pin;
+	ve->base.request_alloc = execlists_request_alloc;
+
+	ve->base.schedule = i915_schedule;
+	ve->base.submit_request = virtual_submit_request;
+
+	ve->base.execlists.queue_priority = INT_MIN;
+	tasklet_init(&ve->base.execlists.tasklet,
+		     virtual_submission_tasklet,
+		     (unsigned long)ve);
+
+	ve->count = count;
+	for (n = 0; n < count; n++) {
+		struct intel_engine_cs *sibling = siblings[n];
+
+		ve->siblings[n] = sibling;
+		ve->base.mask |= sibling->mask;
+
+		if (sibling->execlists.tasklet.func != execlists_submission_tasklet) {
+			err = -ENODEV;
+			ve->count = n;
+			goto err_put;
+		}
+
+		if (RB_EMPTY_NODE(&ve->nodes[sibling->id].rb)) {
+			err = -EINVAL;
+			ve->count = n;
+			goto err_put;
+		}
+
+		RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
+
+		if (ve->base.class != OTHER_CLASS) {
+			if (ve->base.class != sibling->class) {
+				err = -EINVAL;
+				ve->count = n;
+				goto err_put;
+			}
+			continue;
+		}
+
+		ve->base.class = sibling->class;
+		snprintf(ve->base.name, sizeof(ve->base.name),
+			 "v%dx%d", ve->base.class, count);
+		ve->base.context_size = sibling->context_size;
+
+		ve->base.emit_bb_start = sibling->emit_bb_start;
+		ve->base.emit_flush = sibling->emit_flush;
+		ve->base.emit_breadcrumb = sibling->emit_breadcrumb;
+		ve->base.emit_breadcrumb_sz = sibling->emit_breadcrumb_sz;
+	}
+
+	return &ve->base;
+
+err_put:
+	virtual_engine_free(&ve->kref);
+	return ERR_PTR(err);
+}
+
+void intel_virtual_engine_put(struct intel_engine_cs *engine)
+{
+	if (!engine)
+		return;
+
+	if (engine->id != -1)
+		return;
+
+	kref_put(&to_virtual_engine(engine)->kref, virtual_engine_free);
+}
+
 void intel_execlists_show_requests(struct intel_engine_cs *engine,
 				   struct drm_printer *m,
 				   void (*show_request)(struct drm_printer *m,
@@ -2780,6 +3186,29 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 		show_request(m, last, "\t\tQ ");
 	}
 
+	last = NULL;
+	count = 0;
+	for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
+		struct virtual_engine *ve =
+			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
+		struct i915_request *rq = READ_ONCE(ve->request);
+
+		if (rq) {
+			if (count++ < max - 1)
+				show_request(m, rq, "\t\tV ");
+			else
+				last = rq;
+		}
+	}
+	if (last) {
+		if (count > max) {
+			drm_printf(m,
+				   "\t\t...skipping %d virtual requests...\n",
+				   count - max);
+		}
+		show_request(m, last, "\t\tV ");
+	}
+
 	spin_unlock_irqrestore(&engine->execution_lock, flags);
 }
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.h b/drivers/gpu/drm/i915/intel_lrc.h
index 3d86c27c6b32..c2a5ede8c126 100644
--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -112,4 +112,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
 							const char *prefix),
 				   unsigned int max);
 
+struct intel_engine_cs *
+intel_execlists_create_virtual(struct i915_gem_context *ctx,
+			       struct intel_engine_cs **siblings,
+			       unsigned int count);
+void intel_virtual_engine_put(struct intel_engine_cs *engine);
+
 #endif /* _INTEL_LRC_H_ */
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index c1afe77260d0..b1497b2ada2a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -317,6 +317,7 @@ struct intel_engine_execlists {
 	 * @queue: queue of requests, in priority lists
 	 */
 	struct rb_root_cached queue;
+	struct rb_root_cached virtual;
 
 	/**
 	 * @csb_write: control register for Context Switch buffer
@@ -508,6 +509,7 @@ struct intel_engine_cs {
 #define I915_ENGINE_NEEDS_CMD_PARSER BIT(0)
 #define I915_ENGINE_SUPPORTS_STATS   BIT(1)
 #define I915_ENGINE_HAS_PREEMPTION   BIT(2)
+#define I915_ENGINE_IS_VIRTUAL       BIT(3)
 	unsigned int flags;
 
 	/*
@@ -590,6 +592,12 @@ static inline bool __execlists_need_preempt(int prio, int last)
 	return prio > max(0, last);
 }
 
+static inline bool
+intel_engine_is_virtual(const struct intel_engine_cs *engine)
+{
+	return engine->flags & I915_ENGINE_IS_VIRTUAL;
+}
+
 static inline void
 execlists_set_active(struct intel_engine_execlists *execlists,
 		     unsigned int bit)
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 3e68c2888b9c..22beeb674b17 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -27,6 +27,7 @@
 #include "../i915_selftest.h"
 #include "i915_random.h"
 #include "igt_flush_test.h"
+#include "igt_live_test.h"
 
 #include "mock_drm.h"
 #include "mock_gem_device.h"
@@ -34,77 +35,6 @@
 
 #define DW_PER_PAGE (PAGE_SIZE / sizeof(u32))
 
-struct live_test {
-	struct drm_i915_private *i915;
-	const char *func;
-	const char *name;
-
-	unsigned int reset_global;
-	unsigned int reset_engine[I915_NUM_ENGINES];
-};
-
-static int begin_live_test(struct live_test *t,
-			   struct drm_i915_private *i915,
-			   const char *func,
-			   const char *name)
-{
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-	int err;
-
-	t->i915 = i915;
-	t->func = func;
-	t->name = name;
-
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err) {
-		pr_err("%s(%s): failed to idle before, with err=%d!",
-		       func, name, err);
-		return err;
-	}
-
-	t->reset_global = i915_reset_count(&i915->gpu_error);
-
-	for_each_engine(engine, i915, id)
-		t->reset_engine[id] =
-			i915_reset_engine_count(&i915->gpu_error, engine);
-
-	return 0;
-}
-
-static int end_live_test(struct live_test *t)
-{
-	struct drm_i915_private *i915 = t->i915;
-	struct intel_engine_cs *engine;
-	enum intel_engine_id id;
-
-	if (igt_flush_test(i915, I915_WAIT_LOCKED))
-		return -EIO;
-
-	if (t->reset_global != i915_reset_count(&i915->gpu_error)) {
-		pr_err("%s(%s): GPU was reset %d times!\n",
-		       t->func, t->name,
-		       i915_reset_count(&i915->gpu_error) - t->reset_global);
-		return -EIO;
-	}
-
-	for_each_engine(engine, i915, id) {
-		if (t->reset_engine[id] ==
-		    i915_reset_engine_count(&i915->gpu_error, engine))
-			continue;
-
-		pr_err("%s(%s): engine '%s' was reset %d times!\n",
-		       t->func, t->name, engine->name,
-		       i915_reset_engine_count(&i915->gpu_error, engine) -
-		       t->reset_engine[id]);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 static int live_nop_switch(void *arg)
 {
 	const unsigned int nctx = 1024;
@@ -114,7 +44,7 @@ static int live_nop_switch(void *arg)
 	enum intel_engine_id id;
 	intel_wakeref_t wakeref;
 	struct drm_file *file;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned long n;
 	int err = -ENODEV;
 
@@ -178,7 +108,7 @@ static int live_nop_switch(void *arg)
 		pr_info("Populated %d contexts on %s in %lluns\n",
 			nctx, engine->name, ktime_to_ns(times[1] - times[0]));
 
-		err = begin_live_test(&t, i915, __func__, engine->name);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
 			goto out_unlock;
 
@@ -226,7 +156,7 @@ static int live_nop_switch(void *arg)
 				break;
 		}
 
-		err = end_live_test(&t);
+		err = igt_live_test_end(&t);
 		if (err)
 			goto out_unlock;
 
@@ -576,7 +506,7 @@ static int igt_ctx_exec(void *arg)
 		struct drm_file *file;
 		IGT_TIMEOUT(end_time);
 		LIST_HEAD(objects);
-		struct live_test t;
+		struct igt_live_test t;
 
 		if (!intel_engine_can_store_dword(engine))
 			continue;
@@ -590,7 +520,7 @@ static int igt_ctx_exec(void *arg)
 
 		mutex_lock(&i915->drm.struct_mutex);
 
-		err = begin_live_test(&t, i915, __func__, engine->name);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
 			goto out_unlock;
 
@@ -651,7 +581,7 @@ static int igt_ctx_exec(void *arg)
 		}
 
 out_unlock:
-		if (end_live_test(&t))
+		if (igt_live_test_end(&t))
 			err = -EIO;
 		mutex_unlock(&i915->drm.struct_mutex);
 
@@ -684,7 +614,7 @@ static int igt_shared_ctx_exec(void *arg)
 		struct drm_file *file;
 		IGT_TIMEOUT(end_time);
 		LIST_HEAD(objects);
-		struct live_test t;
+		struct igt_live_test t;
 
 		if (!intel_engine_can_store_dword(engine))
 			continue;
@@ -695,7 +625,7 @@ static int igt_shared_ctx_exec(void *arg)
 
 		mutex_lock(&i915->drm.struct_mutex);
 
-		err = begin_live_test(&t, i915, __func__, engine->name);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
 			goto out_unlock;
 
@@ -774,7 +704,7 @@ static int igt_shared_ctx_exec(void *arg)
 		}
 
 out_unlock:
-		if (end_live_test(&t))
+		if (igt_live_test_end(&t))
 			err = -EIO;
 		mutex_unlock(&i915->drm.struct_mutex);
 
@@ -797,7 +727,7 @@ static int igt_ctx_readonly(void *arg)
 	I915_RND_STATE(prng);
 	IGT_TIMEOUT(end_time);
 	LIST_HEAD(objects);
-	struct live_test t;
+	struct igt_live_test t;
 	int err = -ENODEV;
 
 	/*
@@ -812,7 +742,7 @@ static int igt_ctx_readonly(void *arg)
 
 	mutex_lock(&i915->drm.struct_mutex);
 
-	err = begin_live_test(&t, i915, __func__, "");
+	err = igt_live_test_begin(&t, i915, __func__, "");
 	if (err)
 		goto out_unlock;
 
@@ -891,7 +821,7 @@ static int igt_ctx_readonly(void *arg)
 	}
 
 out_unlock:
-	if (end_live_test(&t))
+	if (igt_live_test_end(&t))
 		err = -EIO;
 	mutex_unlock(&i915->drm.struct_mutex);
 
@@ -1119,7 +1049,7 @@ static int igt_vm_isolation(void *arg)
 	struct drm_file *file;
 	I915_RND_STATE(prng);
 	unsigned long count;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned int id;
 	u64 vm_total;
 	int err;
@@ -1138,7 +1068,7 @@ static int igt_vm_isolation(void *arg)
 
 	mutex_lock(&i915->drm.struct_mutex);
 
-	err = begin_live_test(&t, i915, __func__, "");
+	err = igt_live_test_begin(&t, i915, __func__, "");
 	if (err)
 		goto out_unlock;
 
@@ -1209,7 +1139,7 @@ static int igt_vm_isolation(void *arg)
 out_rpm:
 	intel_runtime_pm_put(i915, wakeref);
 out_unlock:
-	if (end_live_test(&t))
+	if (igt_live_test_end(&t))
 		err = -EIO;
 	mutex_unlock(&i915->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_request.c b/drivers/gpu/drm/i915/selftests/i915_request.c
index fa079c6d9c65..f60ba9b3990c 100644
--- a/drivers/gpu/drm/i915/selftests/i915_request.c
+++ b/drivers/gpu/drm/i915/selftests/i915_request.c
@@ -26,6 +26,7 @@
 
 #include "../i915_selftest.h"
 #include "i915_random.h"
+#include "igt_live_test.h"
 #include "lib_sw_fence.h"
 
 #include "mock_context.h"
@@ -506,66 +507,12 @@ int i915_request_mock_selftests(void)
 	return err;
 }
 
-struct live_test {
-	struct drm_i915_private *i915;
-	const char *func;
-	const char *name;
-
-	unsigned int reset_count;
-};
-
-static int begin_live_test(struct live_test *t,
-			   struct drm_i915_private *i915,
-			   const char *func,
-			   const char *name)
-{
-	int err;
-
-	t->i915 = i915;
-	t->func = func;
-	t->name = name;
-
-	err = i915_gem_wait_for_idle(i915,
-				     I915_WAIT_LOCKED,
-				     MAX_SCHEDULE_TIMEOUT);
-	if (err) {
-		pr_err("%s(%s): failed to idle before, with err=%d!",
-		       func, name, err);
-		return err;
-	}
-
-	t->reset_count = i915_reset_count(&i915->gpu_error);
-
-	return 0;
-}
-
-static int end_live_test(struct live_test *t)
-{
-	struct drm_i915_private *i915 = t->i915;
-
-	i915_retire_requests(i915);
-
-	if (wait_for(intel_engines_are_idle(i915), 10)) {
-		pr_err("%s(%s): GPU not idle\n", t->func, t->name);
-		return -EIO;
-	}
-
-	if (t->reset_count != i915_reset_count(&i915->gpu_error)) {
-		pr_err("%s(%s): GPU was reset %d times!\n",
-		       t->func, t->name,
-		       i915_reset_count(&i915->gpu_error) - t->reset_count);
-		return -EIO;
-	}
-
-	return 0;
-}
-
 static int live_nop_request(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned int id;
 	int err = -ENODEV;
 
@@ -583,7 +530,7 @@ static int live_nop_request(void *arg)
 		IGT_TIMEOUT(end_time);
 		ktime_t times[2] = {};
 
-		err = begin_live_test(&t, i915, __func__, engine->name);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
 			goto out_unlock;
 
@@ -625,7 +572,7 @@ static int live_nop_request(void *arg)
 				break;
 		}
 
-		err = end_live_test(&t);
+		err = igt_live_test_end(&t);
 		if (err)
 			goto out_unlock;
 
@@ -713,7 +660,7 @@ static int live_empty_request(void *arg)
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
 	struct i915_vma *batch;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned int id;
 	int err = 0;
 
@@ -737,7 +684,7 @@ static int live_empty_request(void *arg)
 		unsigned long n, prime;
 		ktime_t times[2] = {};
 
-		err = begin_live_test(&t, i915, __func__, engine->name);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
 			goto out_batch;
 
@@ -773,7 +720,7 @@ static int live_empty_request(void *arg)
 				break;
 		}
 
-		err = end_live_test(&t);
+		err = igt_live_test_end(&t);
 		if (err)
 			goto out_batch;
 
@@ -873,7 +820,7 @@ static int live_all_engines(void *arg)
 	struct i915_request *request[I915_NUM_ENGINES];
 	intel_wakeref_t wakeref;
 	struct i915_vma *batch;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned int id;
 	int err;
 
@@ -885,7 +832,7 @@ static int live_all_engines(void *arg)
 	mutex_lock(&i915->drm.struct_mutex);
 	wakeref = intel_runtime_pm_get(i915);
 
-	err = begin_live_test(&t, i915, __func__, "");
+	err = igt_live_test_begin(&t, i915, __func__, "");
 	if (err)
 		goto out_unlock;
 
@@ -957,7 +904,7 @@ static int live_all_engines(void *arg)
 		request[id] = NULL;
 	}
 
-	err = end_live_test(&t);
+	err = igt_live_test_end(&t);
 
 out_request:
 	for_each_engine(engine, i915, id)
@@ -978,7 +925,7 @@ static int live_sequential_engines(void *arg)
 	struct i915_request *prev = NULL;
 	struct intel_engine_cs *engine;
 	intel_wakeref_t wakeref;
-	struct live_test t;
+	struct igt_live_test t;
 	unsigned int id;
 	int err;
 
@@ -991,7 +938,7 @@ static int live_sequential_engines(void *arg)
 	mutex_lock(&i915->drm.struct_mutex);
 	wakeref = intel_runtime_pm_get(i915);
 
-	err = begin_live_test(&t, i915, __func__, "");
+	err = igt_live_test_begin(&t, i915, __func__, "");
 	if (err)
 		goto out_unlock;
 
@@ -1074,7 +1021,7 @@ static int live_sequential_engines(void *arg)
 		GEM_BUG_ON(!i915_request_completed(request[id]));
 	}
 
-	err = end_live_test(&t);
+	err = igt_live_test_end(&t);
 
 out_request:
 	for_each_engine(engine, i915, id) {
@@ -1152,7 +1099,7 @@ static int live_breadcrumbs_smoketest(void *arg)
 	enum intel_engine_id id;
 	intel_wakeref_t wakeref;
 	struct drm_file *file;
-	struct live_test live;
+	struct igt_live_test live;
 	unsigned int n;
 	int ret = 0;
 
@@ -1190,7 +1137,7 @@ static int live_breadcrumbs_smoketest(void *arg)
 		}
 	}
 
-	ret = begin_live_test(&live, i915, __func__, "");
+	ret = igt_live_test_begin(&live, i915, __func__, "");
 	if (ret)
 		goto out_contexts;
 
@@ -1248,7 +1195,7 @@ static int live_breadcrumbs_smoketest(void *arg)
 		num_waits, num_fences, RUNTIME_INFO(i915)->num_rings, ncpus);
 
 	mutex_lock(&i915->drm.struct_mutex);
-	ret = end_live_test(&live) ?: ret;
+	ret = igt_live_test_end(&live) ?: ret;
 out_contexts:
 	mutex_unlock(&i915->drm.struct_mutex);
 	kfree(t[0].contexts);
diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.c b/drivers/gpu/drm/i915/selftests/igt_live_test.c
new file mode 100644
index 000000000000..3e902761cd16
--- /dev/null
+++ b/drivers/gpu/drm/i915/selftests/igt_live_test.c
@@ -0,0 +1,78 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2018 Intel Corporation
+ */
+
+#include "../i915_drv.h"
+
+#include "../i915_selftest.h"
+#include "igt_flush_test.h"
+#include "igt_live_test.h"
+
+int igt_live_test_begin(struct igt_live_test *t,
+			struct drm_i915_private *i915,
+			const char *func,
+			const char *name)
+{
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	t->i915 = i915;
+	t->func = func;
+	t->name = name;
+
+	err = i915_gem_wait_for_idle(i915,
+				     I915_WAIT_INTERRUPTIBLE |
+				     I915_WAIT_LOCKED,
+				     MAX_SCHEDULE_TIMEOUT);
+	if (err) {
+		pr_err("%s(%s): failed to idle before, with err=%d!",
+		       func, name, err);
+		return err;
+	}
+
+	t->reset_global = i915_reset_count(&i915->gpu_error);
+
+	for_each_engine(engine, i915, id)
+		t->reset_engine[id] =
+			i915_reset_engine_count(&i915->gpu_error, engine);
+
+	return 0;
+}
+
+int igt_live_test_end(struct igt_live_test *t)
+{
+	struct drm_i915_private *i915 = t->i915;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+
+	lockdep_assert_held(&i915->drm.struct_mutex);
+
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		return -EIO;
+
+	if (t->reset_global != i915_reset_count(&i915->gpu_error)) {
+		pr_err("%s(%s): GPU was reset %d times!\n",
+		       t->func, t->name,
+		       i915_reset_count(&i915->gpu_error) - t->reset_global);
+		return -EIO;
+	}
+
+	for_each_engine(engine, i915, id) {
+		if (t->reset_engine[id] ==
+		    i915_reset_engine_count(&i915->gpu_error, engine))
+			continue;
+
+		pr_err("%s(%s): engine '%s' was reset %d times!\n",
+		       t->func, t->name, engine->name,
+		       i915_reset_engine_count(&i915->gpu_error, engine) -
+		       t->reset_engine[id]);
+		return -EIO;
+	}
+
+	return 0;
+}
diff --git a/drivers/gpu/drm/i915/selftests/igt_live_test.h b/drivers/gpu/drm/i915/selftests/igt_live_test.h
new file mode 100644
index 000000000000..94570044675e
--- /dev/null
+++ b/drivers/gpu/drm/i915/selftests/igt_live_test.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-License-Identifier: MIT
+ *
+ * Copyright © 2019 Intel Corporation
+ */
+
+#ifndef IGT_LIVE_TEST_H
+#define IGT_LIVE_TEST_H
+
+#include "../i915_gem.h"
+
+struct drm_i915_private;
+
+struct igt_live_test {
+	struct drm_i915_private *i915;
+	const char *func;
+	const char *name;
+
+	unsigned int reset_global;
+	unsigned int reset_engine[I915_NUM_ENGINES];
+};
+
+int igt_live_test_begin(struct igt_live_test *t,
+			struct drm_i915_private *i915,
+			const char *func,
+			const char *name);
+int igt_live_test_end(struct igt_live_test *t);
+
+#endif /* IGT_LIVE_TEST_H */
diff --git a/drivers/gpu/drm/i915/selftests/intel_lrc.c b/drivers/gpu/drm/i915/selftests/intel_lrc.c
index 607c969ea605..bb0ea918a9ef 100644
--- a/drivers/gpu/drm/i915/selftests/intel_lrc.c
+++ b/drivers/gpu/drm/i915/selftests/intel_lrc.c
@@ -4,6 +4,8 @@
  * Copyright © 2018 Intel Corporation
  */
 
+#include <linux/prime_numbers.h>
+
 #include "../i915_reset.h"
 
 #include "../i915_selftest.h"
@@ -641,6 +643,212 @@ static int live_preempt_smoke(void *arg)
 	return err;
 }
 
+struct live_test {
+	struct drm_i915_private *i915;
+	const char *func;
+	const char *name;
+
+	unsigned int reset_count;
+	bool wedge;
+};
+
+static int begin_live_test(struct live_test *t,
+			   struct drm_i915_private *i915,
+			   const char *func,
+			   const char *name)
+{
+	t->i915 = i915;
+	t->func = func;
+	t->name = name;
+
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		return -EIO;
+
+	t->reset_count = i915_reset_count(&i915->gpu_error);
+
+	return 0;
+}
+
+static int end_live_test(struct live_test *t)
+{
+	struct drm_i915_private *i915 = t->i915;
+
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		return -EIO;
+
+	if (t->reset_count != i915_reset_count(&i915->gpu_error)) {
+		pr_err("%s(%s): GPU was reset %d times!\n",
+		       t->func, t->name,
+		       i915_reset_count(&i915->gpu_error) - t->reset_count);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+static int nop_virtual_engine(struct drm_i915_private *i915,
+			      struct intel_engine_cs **siblings,
+			      unsigned int nsibling,
+			      unsigned int nctx,
+			      unsigned int flags)
+#define CHAIN BIT(0)
+{
+	IGT_TIMEOUT(end_time);
+	struct i915_request *request[16];
+	struct i915_gem_context *ctx[16];
+	struct intel_engine_cs *ve[16];
+	unsigned long n, prime, nc;
+	ktime_t times[2] = {};
+	struct live_test t;
+	int err;
+
+	GEM_BUG_ON(!nctx || nctx > ARRAY_SIZE(ctx));
+
+	for (n = 0; n < nctx; n++) {
+		ctx[n] = kernel_context(i915);
+		if (!ctx[n])
+			return -ENOMEM;
+
+		ve[n] = intel_execlists_create_virtual(ctx[n],
+						       siblings, nsibling);
+		if (IS_ERR(ve[n]))
+			return PTR_ERR(ve[n]);
+	}
+
+	err = begin_live_test(&t, i915, __func__, ve[0]->name);
+	if (err)
+		goto out;
+
+	for_each_prime_number_from(prime, 1, 8192) {
+		times[1] = ktime_get_raw();
+
+		if (flags & CHAIN) {
+			for (nc = 0; nc < nctx; nc++) {
+				for (n = 0; n < prime; n++) {
+					request[nc] =
+						i915_request_alloc(ve[nc], ctx[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		} else {
+			for (n = 0; n < prime; n++) {
+				for (nc = 0; nc < nctx; nc++) {
+					request[nc] =
+						i915_request_alloc(ve[nc], ctx[nc]);
+					if (IS_ERR(request[nc])) {
+						err = PTR_ERR(request[nc]);
+						goto out;
+					}
+
+					i915_request_add(request[nc]);
+				}
+			}
+		}
+
+		for (nc = 0; nc < nctx; nc++) {
+			if (i915_request_wait(request[nc],
+					      I915_WAIT_LOCKED,
+					      HZ / 10) < 0) {
+				pr_err("%s(%s): wait for %llx:%lld timed out\n",
+				       __func__, ve[0]->name,
+				       request[nc]->fence.context,
+				       request[nc]->fence.seqno);
+
+				GEM_TRACE("%s(%s) failed at request %llx:%lld\n",
+					  __func__, ve[0]->name,
+					  request[nc]->fence.context,
+					  request[nc]->fence.seqno);
+				GEM_TRACE_DUMP();
+				i915_gem_set_wedged(i915);
+				break;
+			}
+		}
+
+		times[1] = ktime_sub(ktime_get_raw(), times[1]);
+		if (prime == 1)
+			times[0] = times[1];
+
+		if (__igt_timeout(end_time, NULL))
+			break;
+	}
+
+	err = end_live_test(&t);
+	if (err)
+		goto out;
+
+	pr_info("Requestx%d latencies on %s: 1 = %lluns, %lu = %lluns\n",
+		nctx, ve[0]->name, ktime_to_ns(times[0]),
+		prime, div64_u64(ktime_to_ns(times[1]), prime));
+
+out:
+	if (igt_flush_test(i915, I915_WAIT_LOCKED))
+		err = -EIO;
+
+	for (nc = 0; nc < nctx; nc++) {
+		intel_virtual_engine_put(ve[nc]);
+		kernel_context_close(ctx[nc]);
+	}
+	return err;
+}
+
+static int live_virtual_engine(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	unsigned int class, inst;
+	int err = -ENODEV;
+
+	if (USES_GUC_SUBMISSION(i915))
+		return 0;
+
+	mutex_lock(&i915->drm.struct_mutex);
+
+	for_each_engine(engine, i915, id) {
+		err = nop_virtual_engine(i915, &engine, 1, 1, 0);
+		if (err) {
+			pr_err("Failed to wrap engine %s: err=%d\n",
+			       engine->name, err);
+			goto out_unlock;
+		}
+	}
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		int nsibling, n;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!i915->engine_class[class][inst])
+				break;
+
+			siblings[nsibling++] = i915->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		for (n = 1; n <= nsibling + 1; n++) {
+			err = nop_virtual_engine(i915, siblings, nsibling,
+						 n, 0);
+			if (err)
+				goto out_unlock;
+		}
+
+		err = nop_virtual_engine(i915, siblings, nsibling, n, CHAIN);
+		if (err)
+			goto out_unlock;
+	}
+
+out_unlock:
+	mutex_unlock(&i915->drm.struct_mutex);
+	return err;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
 	static const struct i915_subtest tests[] = {
@@ -649,6 +857,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_late_preempt),
 		SUBTEST(live_preempt_hang),
 		SUBTEST(live_preempt_smoke),
+		SUBTEST(live_virtual_engine),
 	};
 
 	if (!HAS_EXECLISTS(i915))
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index f13f9c726034..47ed6de311bc 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -45,14 +45,8 @@ mock_context(struct drm_i915_private *i915,
 	INIT_LIST_HEAD(&ctx->handles_list);
 	INIT_LIST_HEAD(&ctx->hw_id_link);
 
-	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++) {
-		struct intel_context *ce = &ctx->__engine[n];
-
-		ce->gem_context = ctx;
-		INIT_LIST_HEAD(&ce->active_link);
-		INIT_LIST_HEAD(&ce->signal_link);
-		INIT_LIST_HEAD(&ce->signals);
-	}
+	for (n = 0; n < ARRAY_SIZE(ctx->__engine); n++)
+		intel_context_init(&ctx->__engine[n], ctx, i915->engine[n]);
 
 	ret = i915_gem_context_pin_hw_id(ctx);
 	if (ret < 0)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 84f42317de92..40a758dbe18d 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -123,6 +123,7 @@ enum drm_i915_gem_engine_class {
 };
 
 #define I915_ENGINE_CLASS_INVALID_NONE -1
+#define I915_ENGINE_CLASS_INVALID_VIRTUAL 0
 
 /**
  * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915
@@ -1488,8 +1489,34 @@ struct drm_i915_gem_context_param {
 	__u64 value;
 };
 
+/*
+ * i915_context_engines_load_balance:
+ *
+ * Enable load balancing across this set of engines.
+ *
+ * Into the I915_EXEC_DEFAULT slot [0], a virtual engine is created that when
+ * used will proxy the execbuffer request onto one of the set of engines
+ * in such a way as to distribute the load evenly across the set.
+ *
+ * The set of engines must be compatible (e.g. the same HW class) as they
+ * will share the same logical GPU context and ring.
+ *
+ * To intermix rendering with the virtual engine and direct rendering onto
+ * the backing engines (bypassing the load balancing proxy), the context must
+ * be defined to use a single timeline for all engines.
+ */
+struct i915_context_engines_load_balance {
+	struct i915_user_extension base;
+
+	__u64 flags; /* all undefined flags must be zero */
+	__u64 engines_mask; /* selection mask of engines[] */
+
+	__u64 mbz[4]; /* reserved for future use; must be zero */
+};
+
 struct i915_context_param_engines {
 	__u64 extensions; /* linked chain of extension blocks, 0 terminates */
+#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0
 
 	struct {
 		__u16 engine_class; /* see enum drm_i915_gem_engine_class */
-- 
2.20.1

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (37 preceding siblings ...)
  2019-01-18 14:01 ` [PATCH 38/38] drm/i915: Load balancing across a virtual engine Chris Wilson
@ 2019-01-18 14:17 ` Patchwork
  2019-01-24 16:28 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context (rev2) Patchwork
  39 siblings, 0 replies; 66+ messages in thread
From: Patchwork @ 2019-01-18 14:17 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/38] drm/i915/execlists: Store the highest priority context
URL   : https://patchwork.freedesktop.org/series/55411/
State : failure

== Summary ==

  CALL    scripts/checksyscalls.sh
  DESCEND  objtool
  CHK     include/generated/compile.h
  CC [M]  drivers/gpu/drm/i915/i915_timeline.o
In file included from ./arch/x86/include/asm/bug.h:83:0,
                 from ./include/linux/bug.h:5,
                 from ./include/linux/mmdebug.h:5,
                 from ./include/linux/gfp.h:5,
                 from ./include/linux/slab.h:15,
                 from ./include/linux/io-mapping.h:22,
                 from drivers/gpu/drm/i915/i915_drv.h:36,
                 from drivers/gpu/drm/i915/i915_timeline.c:7:
drivers/gpu/drm/i915/selftests/i915_timeline.c: In function ‘tl_write’:
./include/linux/lockdep.h:340:52: error: invalid type argument of ‘->’ (have ‘struct mutex’)
 #define lockdep_is_held(lock)  lock_is_held(&(lock)->dep_map)
                                                    ^
./include/asm-generic/bug.h:131:25: note: in definition of macro ‘WARN’
  int __ret_warn_on = !!(condition);    \
                         ^~~~~~~~~
./include/linux/lockdep.h:366:3: note: in expansion of macro ‘WARN_ON’
   WARN_ON(debug_locks && !lockdep_is_held(l)); \
   ^~~~~~~
./include/linux/lockdep.h:366:27: note: in expansion of macro ‘lockdep_is_held’
   WARN_ON(debug_locks && !lockdep_is_held(l)); \
                           ^~~~~~~~~~~~~~~
drivers/gpu/drm/i915/selftests/i915_timeline.c:310:2: note: in expansion of macro ‘lockdep_assert_held’
  lockdep_assert_held(tl->i915->drm.struct_mutex); /* lazy with rq refs */
  ^~~~~~~~~~~~~~~~~~~
scripts/Makefile.build:276: recipe for target 'drivers/gpu/drm/i915/i915_timeline.o' failed
make[4]: *** [drivers/gpu/drm/i915/i915_timeline.o] Error 1
scripts/Makefile.build:492: recipe for target 'drivers/gpu/drm/i915' failed
make[3]: *** [drivers/gpu/drm/i915] Error 2
scripts/Makefile.build:492: recipe for target 'drivers/gpu/drm' failed
make[2]: *** [drivers/gpu/drm] Error 2
scripts/Makefile.build:492: recipe for target 'drivers/gpu' failed
make[1]: *** [drivers/gpu] Error 2
Makefile:1042: recipe for target 'drivers' failed
make: *** [drivers] Error 2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 02/38] drm/i915: Make all GPU resets atomic
  2019-01-18 14:00 ` [PATCH 02/38] drm/i915: Make all GPU resets atomic Chris Wilson
@ 2019-01-18 14:22   ` Mika Kuoppala
  0 siblings, 0 replies; 66+ messages in thread
From: Mika Kuoppala @ 2019-01-18 14:22 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> In preparation for the next few commits, make resetting the GPU atomic.
> Currently, we have prepared gen6+ for atomic resetting of individual
> engines, but now there is a requirement to perform the whole
> device-level reset (just the register poking) from inside an atomic
> context.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/i915_reset.c | 50 +++++++++++++++++--------------
>  1 file changed, 27 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index 342d9ee42601..b9d0ea70361c 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -144,14 +144,14 @@ static int i915_do_reset(struct drm_i915_private *i915,
>  
>  	/* Assert reset for at least 20 usec, and wait for acknowledgement. */
>  	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	usleep_range(50, 200);
> -	err = wait_for(i915_in_reset(pdev), 500);
> +	udelay(50);
> +	err = wait_for_atomic(i915_in_reset(pdev), 50);
>  
>  	/* Clear the reset request. */
>  	pci_write_config_byte(pdev, I915_GDRST, 0);
> -	usleep_range(50, 200);
> +	udelay(50);
>  	if (!err)
> -		err = wait_for(!i915_in_reset(pdev), 500);
> +		err = wait_for_atomic(!i915_in_reset(pdev), 50);
>  
>  	return err;
>  }
> @@ -171,7 +171,7 @@ static int g33_do_reset(struct drm_i915_private *i915,
>  	struct pci_dev *pdev = i915->drm.pdev;
>  
>  	pci_write_config_byte(pdev, I915_GDRST, GRDOM_RESET_ENABLE);
> -	return wait_for(g4x_reset_complete(pdev), 500);
> +	return wait_for_atomic(g4x_reset_complete(pdev), 50);
>  }
>  
>  static int g4x_do_reset(struct drm_i915_private *dev_priv,
> @@ -182,13 +182,13 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  	int ret;
>  
>  	/* WaVcpClkGateDisableForMediaReset:ctg,elk */
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> +	I915_WRITE_FW(VDECCLK_GATE_D,
> +		      I915_READ(VDECCLK_GATE_D) | VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ_FW(VDECCLK_GATE_D);
>  
>  	pci_write_config_byte(pdev, I915_GDRST,
>  			      GRDOM_MEDIA | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
>  		goto out;
> @@ -196,7 +196,7 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  
>  	pci_write_config_byte(pdev, I915_GDRST,
>  			      GRDOM_RENDER | GRDOM_RESET_ENABLE);
> -	ret =  wait_for(g4x_reset_complete(pdev), 500);
> +	ret =  wait_for_atomic(g4x_reset_complete(pdev), 50);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
>  		goto out;
> @@ -205,9 +205,9 @@ static int g4x_do_reset(struct drm_i915_private *dev_priv,
>  out:
>  	pci_write_config_byte(pdev, I915_GDRST, 0);
>  
> -	I915_WRITE(VDECCLK_GATE_D,
> -		   I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> -	POSTING_READ(VDECCLK_GATE_D);
> +	I915_WRITE_FW(VDECCLK_GATE_D,
> +		      I915_READ(VDECCLK_GATE_D) & ~VCP_UNIT_CLOCK_GATE_DISABLE);
> +	POSTING_READ_FW(VDECCLK_GATE_D);
>  
>  	return ret;
>  }
> @@ -218,27 +218,29 @@ static int ironlake_do_reset(struct drm_i915_private *dev_priv,
>  {
>  	int ret;
>  
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> +	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_RENDER | ILK_GRDOM_RESET_ENABLE);
> +	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
> +					   ILK_GRDOM_RESET_ENABLE, 0,
> +					   5000, 0,
> +					   NULL);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for render reset failed\n");
>  		goto out;
>  	}
>  
> -	I915_WRITE(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> -	ret = intel_wait_for_register(dev_priv,
> -				      ILK_GDSR, ILK_GRDOM_RESET_ENABLE, 0,
> -				      500);
> +	I915_WRITE_FW(ILK_GDSR, ILK_GRDOM_MEDIA | ILK_GRDOM_RESET_ENABLE);
> +	ret = __intel_wait_for_register_fw(dev_priv, ILK_GDSR,
> +					   ILK_GRDOM_RESET_ENABLE, 0,
> +					   5000, 0,
> +					   NULL);
>  	if (ret) {
>  		DRM_DEBUG_DRIVER("Wait for media reset failed\n");
>  		goto out;
>  	}
>  
>  out:
> -	I915_WRITE(ILK_GDSR, 0);
> -	POSTING_READ(ILK_GDSR);
> +	I915_WRITE_FW(ILK_GDSR, 0);
> +	POSTING_READ_FW(ILK_GDSR);
>  	return ret;
>  }
>  
> @@ -572,7 +574,9 @@ int intel_gpu_reset(struct drm_i915_private *i915, unsigned int engine_mask)
>  		ret = -ENODEV;
>  		if (reset) {
>  			GEM_TRACE("engine_mask=%x\n", engine_mask);
> +			preempt_disable();
>  			ret = reset(i915, engine_mask, retry);
> +			preempt_enable();
>  		}
>  		if (ret != -ETIMEDOUT || engine_mask != ALL_ENGINES)
>  			break;
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 05/38] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest
  2019-01-18 14:00 ` [PATCH 05/38] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
@ 2019-01-18 14:29   ` Mika Kuoppala
  0 siblings, 0 replies; 66+ messages in thread
From: Mika Kuoppala @ 2019-01-18 14:29 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

Chris Wilson <chris@chris-wilson.co.uk> writes:

> Trim the struct_mutex hold and exclude the call to i915_gem_set_wedged()
> as a reminder that it must be callable without struct_mutex held.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Michal Wajdeczko <michal.wajdeczko@intel.com>
> Cc: Mika Kuoppala <mika.kuoppala@intel.com>

Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>

> ---
>  drivers/gpu/drm/i915/selftests/intel_hangcheck.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> index 67431355cd6e..28144fd72550 100644
> --- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> +++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
> @@ -389,16 +389,16 @@ static int igt_wedged_reset(void *arg)
>  	/* Check that we can recover a wedged device with a GPU reset */
>  
>  	igt_global_reset_lock(i915);
> -	mutex_lock(&i915->drm.struct_mutex);
>  	wakeref = intel_runtime_pm_get(i915);
>  
>  	i915_gem_set_wedged(i915);
> -	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
>  
> +	mutex_lock(&i915->drm.struct_mutex);
> +	GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
>  	i915_reset(i915, ALL_ENGINES, NULL);
> +	mutex_unlock(&i915->drm.struct_mutex);
>  
>  	intel_runtime_pm_put(i915, wakeref);
> -	mutex_unlock(&i915->drm.struct_mutex);
>  	igt_global_reset_unlock(i915);
>  
>  	return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
> -- 
> 2.20.1
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/intel-gfx
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-18 14:00 ` [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA Chris Wilson
@ 2019-01-18 16:03   ` Tvrtko Ursulin
  2019-01-18 16:06     ` Chris Wilson
  2019-01-22 14:19     ` Chris Wilson
  0 siblings, 2 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-18 16:03 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> Our goal is to remove struct_mutex and replace it with fine-grained
> locking. One of the thorny issues is our eviction logic for reclaiming
> space for an execbuffer (or GTT mmapping, among a few other examples).
> While eviction itself is easy to move under a per-VM mutex, performing
> the activity tracking is less agreeable. One solution is not to do any
> MRU tracking and do a simple coarse evaluation during eviction of
> active/inactive, with a loose temporal ordering of last
> insertion/evaluation. That keeps all the locking constrained to when we
> are manipulating the VM itself, neatly avoiding the tricky handling of
> possible recursive locking during execbuf and elsewhere.
> 
> Note that discarding the MRU is unlikely to impact our efficiency at
> reclaiming VM space (where we think an LRU model is best), as our
> current strategy is to use random idle replacement first before doing
> a search, and over time the use of softpinned 48b per-ppGTT is growing
> (thereby eliminating any need to perform any eviction searches, in
> theory at least).

There is still no mention of list consolidation.
 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem.c               | 10 +--
>   drivers/gpu/drm/i915/i915_gem_evict.c         | 71 ++++++++++++-------
>   drivers/gpu/drm/i915/i915_gem_gtt.c           | 15 ++--
>   drivers/gpu/drm/i915/i915_gem_gtt.h           | 26 +------
>   drivers/gpu/drm/i915/i915_gem_shrinker.c      |  8 ++-
>   drivers/gpu/drm/i915/i915_gem_stolen.c        |  3 +-
>   drivers/gpu/drm/i915/i915_gpu_error.c         | 37 +++++-----
>   drivers/gpu/drm/i915/i915_vma.c               |  9 +--
>   .../gpu/drm/i915/selftests/i915_gem_evict.c   |  4 +-
>   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |  2 +-
>   10 files changed, 84 insertions(+), 101 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d20b42386c3c..f45186ddb236 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -253,10 +253,7 @@ i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
>   
>   	pinned = ggtt->vm.reserved;
>   	mutex_lock(&dev->struct_mutex);
> -	list_for_each_entry(vma, &ggtt->vm.active_list, vm_link)
> -		if (i915_vma_is_pinned(vma))
> -			pinned += vma->node.size;
> -	list_for_each_entry(vma, &ggtt->vm.inactive_list, vm_link)
> +	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
>   		if (i915_vma_is_pinned(vma))
>   			pinned += vma->node.size;
>   	mutex_unlock(&dev->struct_mutex);
> @@ -1539,13 +1536,10 @@ static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
>   	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
>   
>   	for_each_ggtt_vma(vma, obj) {
> -		if (i915_vma_is_active(vma))
> -			continue;
> -
>   		if (!drm_mm_node_allocated(&vma->node))
>   			continue;
>   
> -		list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
> +		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
>   	}
>   
>   	i915 = to_i915(obj->base.dev);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index f6855401f247..5cfe4b75e7d6 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -126,14 +126,10 @@ i915_gem_evict_something(struct i915_address_space *vm,
>   	struct drm_i915_private *dev_priv = vm->i915;
>   	struct drm_mm_scan scan;
>   	struct list_head eviction_list;
> -	struct list_head *phases[] = {
> -		&vm->inactive_list,
> -		&vm->active_list,
> -		NULL,
> -	}, **phase;
>   	struct i915_vma *vma, *next;
>   	struct drm_mm_node *node;
>   	enum drm_mm_insert_mode mode;
> +	struct i915_vma *active;
>   	int ret;
>   
>   	lockdep_assert_held(&vm->i915->drm.struct_mutex);

This is the existing comment not included in the diff here:

	/*
	 * The goal is to evict objects and amalgamate space in LRU order.
	 * The oldest idle objects reside on the inactive list, which is in
	 * retirement order. The next objects to retire are those in flight,
	 * on the active list, again in retirement order.
	 *
	 * The retirement sequence is thus:
	 *   1. Inactive objects (already retired)
	 *   2. Active objects (will stall on unbinding)
	 *
	 * On each list, the oldest objects lie at the HEAD with the freshest
	 * object on the TAIL.
	 */

I appeal once more that you just reword the first and last paragraphs to
accurately reflect the fact that there is only one list now.
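
For example, something along these lines would do (rough sketch only,
exact wording up to you):

	/*
	 * The goal is to evict vma and amalgamate space in a rough LRU
	 * order. Active and inactive vma now share the single bound_list,
	 * kept in loose least-recently scanned order: inactive vma are
	 * cheap to reap as we walk, while active vma (which will stall
	 * on unbinding) are rotated to the tail and only considered once
	 * the whole list has been scanned.
	 *
	 * As before, the oldest entries lie at the HEAD with the
	 * freshest entries on the TAIL.
	 */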

> @@ -169,17 +165,46 @@ i915_gem_evict_something(struct i915_address_space *vm,
>   	 */
>   	if (!(flags & PIN_NONBLOCK))
>   		i915_retire_requests(dev_priv);
> -	else
> -		phases[1] = NULL;
>   
>   search_again:
> +	active = NULL;
>   	INIT_LIST_HEAD(&eviction_list);
> -	phase = phases;
> -	do {
> -		list_for_each_entry(vma, *phase, vm_link)
> -			if (mark_free(&scan, vma, flags, &eviction_list))
> -				goto found;
> -	} while (*++phase);
> +	list_for_each_entry_safe(vma, next, &vm->bound_list, vm_link) {
> +		/*
> +		 * We keep this list in a rough least-recently scanned order
> +		 * of active elements (inactive elements are cheap to reap).
> +		 * New entries are added to the end, and we move anything we
> +		 * scan to the end. The assumption is that the working set
> +		 * of applications is either steady state (and thanks to the
> +		 * userspace bo cache it almost always is) or volatile and
> +		 * frequently replaced after a frame, which are self-evicting!
> +		 * Given that assumption, the MRU order of the scan list is
> +		 * fairly static, and keeping it in least-recently scan order
> +		 * is suitable.
> +		 *
> +		 * To notice when we complete one full cycle, we record the
> +		 * first active element seen, before moving it to the tail.
> +		 */
> +		if (i915_vma_is_active(vma)) {
> +			if (vma == active) {
> +				if (flags & PIN_NONBLOCK)
> +					break;
> +
> +				active = ERR_PTR(-EAGAIN);
> +			}
> +
> +			if (active != ERR_PTR(-EAGAIN)) {
> +				if (!active)
> +					active = vma;
> +
> +				list_move_tail(&vma->vm_link, &vm->bound_list);
> +				continue;
> +			}
> +		}
> +
> +		if (mark_free(&scan, vma, flags, &eviction_list))
> +			goto found;
> +	}
>   
>   	/* Nothing found, clean up and bail out! */
>   	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
> @@ -388,11 +413,6 @@ int i915_gem_evict_for_node(struct i915_address_space *vm,
>    */
>   int i915_gem_evict_vm(struct i915_address_space *vm)
>   {
> -	struct list_head *phases[] = {
> -		&vm->inactive_list,
> -		&vm->active_list,
> -		NULL
> -	}, **phase;
>   	struct list_head eviction_list;
>   	struct i915_vma *vma, *next;
>   	int ret;
> @@ -412,16 +432,13 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
>   	}
>   
>   	INIT_LIST_HEAD(&eviction_list);
> -	phase = phases;
> -	do {
> -		list_for_each_entry(vma, *phase, vm_link) {
> -			if (i915_vma_is_pinned(vma))
> -				continue;
> +	list_for_each_entry(vma, &vm->bound_list, vm_link) {
> +		if (i915_vma_is_pinned(vma))
> +			continue;
>   
> -			__i915_vma_pin(vma);
> -			list_add(&vma->evict_link, &eviction_list);
> -		}
> -	} while (*++phase);
> +		__i915_vma_pin(vma);
> +		list_add(&vma->evict_link, &eviction_list);
> +	}
>   
>   	ret = 0;
>   	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 9081e3bc5a59..2ad9070a54c1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -491,9 +491,8 @@ static void i915_address_space_init(struct i915_address_space *vm, int subclass)
>   
>   	stash_init(&vm->free_pages);
>   
> -	INIT_LIST_HEAD(&vm->active_list);
> -	INIT_LIST_HEAD(&vm->inactive_list);
>   	INIT_LIST_HEAD(&vm->unbound_list);
> +	INIT_LIST_HEAD(&vm->bound_list);
>   }
>   
>   static void i915_address_space_fini(struct i915_address_space *vm)
> @@ -2111,8 +2110,7 @@ void i915_ppgtt_close(struct i915_address_space *vm)
>   static void ppgtt_destroy_vma(struct i915_address_space *vm)
>   {
>   	struct list_head *phases[] = {
> -		&vm->active_list,
> -		&vm->inactive_list,
> +		&vm->bound_list,
>   		&vm->unbound_list,
>   		NULL,
>   	}, **phase;
> @@ -2135,8 +2133,7 @@ void i915_ppgtt_release(struct kref *kref)
>   
>   	ppgtt_destroy_vma(&ppgtt->vm);
>   
> -	GEM_BUG_ON(!list_empty(&ppgtt->vm.active_list));
> -	GEM_BUG_ON(!list_empty(&ppgtt->vm.inactive_list));
> +	GEM_BUG_ON(!list_empty(&ppgtt->vm.bound_list));
>   	GEM_BUG_ON(!list_empty(&ppgtt->vm.unbound_list));
>   
>   	ppgtt->vm.cleanup(&ppgtt->vm);
> @@ -2801,8 +2798,7 @@ void i915_ggtt_cleanup_hw(struct drm_i915_private *dev_priv)
>   	mutex_lock(&dev_priv->drm.struct_mutex);
>   	i915_gem_fini_aliasing_ppgtt(dev_priv);
>   
> -	GEM_BUG_ON(!list_empty(&ggtt->vm.active_list));
> -	list_for_each_entry_safe(vma, vn, &ggtt->vm.inactive_list, vm_link)
> +	list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link)
>   		WARN_ON(i915_vma_unbind(vma));
>   
>   	if (drm_mm_node_allocated(&ggtt->error_capture))
> @@ -3514,8 +3510,7 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
>   	ggtt->vm.closed = true; /* skip rewriting PTE on VMA unbind */
>   
>   	/* clflush objects bound into the GGTT and rebind them. */
> -	GEM_BUG_ON(!list_empty(&ggtt->vm.active_list));
> -	list_for_each_entry_safe(vma, vn, &ggtt->vm.inactive_list, vm_link) {
> +	list_for_each_entry_safe(vma, vn, &ggtt->vm.bound_list, vm_link) {
>   		struct drm_i915_gem_object *obj = vma->obj;
>   
>   		if (!(vma->flags & I915_VMA_GLOBAL_BIND))
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index a0039ea97cdc..bd679c8c56dd 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -299,32 +299,12 @@ struct i915_address_space {
>   	struct i915_page_directory_pointer *scratch_pdp; /* GEN8+ & 48b PPGTT */
>   
>   	/**
> -	 * List of objects currently involved in rendering.
> -	 *
> -	 * Includes buffers having the contents of their GPU caches
> -	 * flushed, not necessarily primitives. last_read_req
> -	 * represents when the rendering involved will be completed.
> -	 *
> -	 * A reference is held on the buffer while on this list.
> +	 * List of vma currently bound.
>   	 */
> -	struct list_head active_list;
> +	struct list_head bound_list;
>   
>   	/**
> -	 * LRU list of objects which are not in the ringbuffer and
> -	 * are ready to unbind, but are still in the GTT.
> -	 *
> -	 * last_read_req is NULL while an object is in this list.
> -	 *
> -	 * A reference is not held on the buffer while on this list,
> -	 * as merely being GTT-bound shouldn't prevent its being
> -	 * freed, and we'll pull it off the list in the free path.
> -	 */
> -	struct list_head inactive_list;
> -
> -	/**
> -	 * List of vma that have been unbound.
> -	 *
> -	 * A reference is not held on the buffer while on this list.
> +	 * List of vma that are not unbound.
>   	 */
>   	struct list_head unbound_list;
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index 8ceecb026910..a76d6c95c824 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -462,9 +462,13 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
>   
>   	/* We also want to clear any cached iomaps as they wrap vmap */
>   	list_for_each_entry_safe(vma, next,
> -				 &i915->ggtt.vm.inactive_list, vm_link) {
> +				 &i915->ggtt.vm.bound_list, vm_link) {
>   		unsigned long count = vma->node.size >> PAGE_SHIFT;
> -		if (vma->iomap && i915_vma_unbind(vma) == 0)
> +
> +		if (!vma->iomap || i915_vma_is_active(vma))
> +			continue;
> +
> +		if (i915_vma_unbind(vma) == 0)
>   			freed_pages += count;
>   	}
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> index 9df615eea2d8..a9e365789686 100644
> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> @@ -701,7 +701,8 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv
>   	vma->pages = obj->mm.pages;
>   	vma->flags |= I915_VMA_GLOBAL_BIND;
>   	__i915_vma_set_map_and_fenceable(vma);
> -	list_move_tail(&vma->vm_link, &ggtt->vm.inactive_list);
> +
> +	list_move_tail(&vma->vm_link, &ggtt->vm.bound_list);
>   
>   	spin_lock(&dev_priv->mm.obj_lock);
>   	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
> diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
> index afccffa9f3f9..6f2fcad0a6ee 100644
> --- a/drivers/gpu/drm/i915/i915_gpu_error.c
> +++ b/drivers/gpu/drm/i915/i915_gpu_error.c
> @@ -1121,7 +1121,7 @@ static void capture_bo(struct drm_i915_error_buffer *err,
>   
>   static u32 capture_error_bo(struct drm_i915_error_buffer *err,
>   			    int count, struct list_head *head,
> -			    bool pinned_only)
> +			    bool active_only, bool pinned_only)

I still dislike the two booleans since they are unreadable at the call
sites, and the only callers pass either "true, false" or "false, true",
so the two flags are basically mutually exclusive.
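
Even a small selector enum would read better at the call sites, e.g.
(rough sketch, names entirely up to you):

	enum capture_mode { CAPTURE_ACTIVE, CAPTURE_PINNED };

	static u32 capture_error_bo(struct drm_i915_error_buffer *err,
				    int count, struct list_head *head,
				    enum capture_mode mode);

	/* and then at the call sites: */
	count = capture_error_bo(active_bo, count, &vm->bound_list,
				 CAPTURE_ACTIVE);
	...
	error->pinned_bo_count =
		capture_error_bo(bo, count, &vm->bound_list, CAPTURE_PINNED);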

It is only a local function, but still, you did not even bother replying
any of the times I mentioned it (three or four rounds now).

It is okay to stay like this, but stale comments and commit message
have to be improved IMO.

Regards,

Tvrtko

>   {
>   	struct i915_vma *vma;
>   	int i = 0;
> @@ -1130,6 +1130,9 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
>   		if (!vma->obj)
>   			continue;
>   
> +		if (active_only && !i915_vma_is_active(vma))
> +			continue;
> +
>   		if (pinned_only && !i915_vma_is_pinned(vma))
>   			continue;
>   
> @@ -1599,14 +1602,16 @@ static void gem_capture_vm(struct i915_gpu_state *error,
>   	int count;
>   
>   	count = 0;
> -	list_for_each_entry(vma, &vm->active_list, vm_link)
> -		count++;
> +	list_for_each_entry(vma, &vm->bound_list, vm_link)
> +		if (i915_vma_is_active(vma))
> +			count++;
>   
>   	active_bo = NULL;
>   	if (count)
>   		active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
>   	if (active_bo)
> -		count = capture_error_bo(active_bo, count, &vm->active_list, false);
> +		count = capture_error_bo(active_bo, count, &vm->bound_list,
> +					 true, false);
>   	else
>   		count = 0;
>   
> @@ -1644,28 +1649,20 @@ static void capture_pinned_buffers(struct i915_gpu_state *error)
>   	struct i915_address_space *vm = &error->i915->ggtt.vm;
>   	struct drm_i915_error_buffer *bo;
>   	struct i915_vma *vma;
> -	int count_inactive, count_active;
> -
> -	count_inactive = 0;
> -	list_for_each_entry(vma, &vm->inactive_list, vm_link)
> -		count_inactive++;
> +	int count;
>   
> -	count_active = 0;
> -	list_for_each_entry(vma, &vm->active_list, vm_link)
> -		count_active++;
> +	count = 0;
> +	list_for_each_entry(vma, &vm->bound_list, vm_link)
> +		count++;
>   
>   	bo = NULL;
> -	if (count_inactive + count_active)
> -		bo = kcalloc(count_inactive + count_active,
> -			     sizeof(*bo), GFP_ATOMIC);
> +	if (count)
> +		bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
>   	if (!bo)
>   		return;
>   
> -	count_inactive = capture_error_bo(bo, count_inactive,
> -					  &vm->active_list, true);
> -	count_active = capture_error_bo(bo + count_inactive, count_active,
> -					&vm->inactive_list, true);
> -	error->pinned_bo_count = count_inactive + count_active;
> +	error->pinned_bo_count =
> +		capture_error_bo(bo, count, &vm->bound_list, false, true);
>   	error->pinned_bo = bo;
>   }
>   
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 5b4d78cdb4ca..7de28baffb8f 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -79,9 +79,6 @@ __i915_vma_retire(struct i915_vma *vma, struct i915_request *rq)
>   	if (--vma->active_count)
>   		return;
>   
> -	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
> -	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
> -
>   	GEM_BUG_ON(!i915_gem_object_is_active(obj));
>   	if (--obj->active_count)
>   		return;
> @@ -659,7 +656,7 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>   	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
>   	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, cache_level));
>   
> -	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
> +	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
>   
>   	if (vma->obj) {
>   		struct drm_i915_gem_object *obj = vma->obj;
> @@ -1003,10 +1000,8 @@ int i915_vma_move_to_active(struct i915_vma *vma,
>   	 * add the active reference first and queue for it to be dropped
>   	 * *last*.
>   	 */
> -	if (!i915_gem_active_isset(active) && !vma->active_count++) {
> -		list_move_tail(&vma->vm_link, &vma->vm->active_list);
> +	if (!i915_gem_active_isset(active) && !vma->active_count++)
>   		obj->active_count++;
> -	}
>   	i915_gem_active_set(active, rq);
>   	GEM_BUG_ON(!i915_vma_is_active(vma));
>   	GEM_BUG_ON(!obj->active_count);
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> index 543d618c152b..9e6b9304f2bd 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> @@ -90,7 +90,7 @@ static int populate_ggtt(struct drm_i915_private *i915)
>   		goto cleanup;
>   	}
>   
> -	if (list_empty(&i915->ggtt.vm.inactive_list)) {
> +	if (list_empty(&i915->ggtt.vm.bound_list)) {
>   		pr_err("No objects on the GGTT inactive list!\n");
>   		return -EINVAL;
>   	}
> @@ -111,7 +111,7 @@ static void unpin_ggtt(struct drm_i915_private *i915)
>   {
>   	struct i915_vma *vma;
>   
> -	list_for_each_entry(vma, &i915->ggtt.vm.inactive_list, vm_link)
> +	list_for_each_entry(vma, &i915->ggtt.vm.bound_list, vm_link)
>   		i915_vma_unpin(vma);
>   }
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index fea8ab14e79d..852b06cb50a0 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -1237,7 +1237,7 @@ static void track_vma_bind(struct i915_vma *vma)
>   	__i915_gem_object_pin_pages(obj);
>   
>   	vma->pages = obj->mm.pages;
> -	list_move_tail(&vma->vm_link, &vma->vm->inactive_list);
> +	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
>   }
>   
>   static int exercise_mock(struct drm_i915_private *i915,
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex.
  2019-01-18 14:00 ` [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex Chris Wilson
@ 2019-01-18 16:04   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-18 16:04 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> A starting point to counter the pervasive struct_mutex. For the goal of
> avoiding (or at least blocking under them!) global locks during user
> request submission, a simple but important step is being able to manage
> each clients GTT separately. For which, we want to replace using the
> struct_mutex as the guard for all things GTT/VM and switch instead to a
> specific mutex inside i915_address_space.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> ---
>   drivers/gpu/drm/i915/i915_gem.c                 | 14 ++++++++------
>   drivers/gpu/drm/i915/i915_gem_evict.c           |  2 ++
>   drivers/gpu/drm/i915/i915_gem_gtt.c             | 15 +++++++++++++--
>   drivers/gpu/drm/i915/i915_gem_shrinker.c        |  4 ++++
>   drivers/gpu/drm/i915/i915_gem_stolen.c          |  2 ++
>   drivers/gpu/drm/i915/i915_vma.c                 | 11 +++++++++++
>   drivers/gpu/drm/i915/selftests/i915_gem_evict.c |  3 +++
>   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c   |  3 +++
>   8 files changed, 46 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index f45186ddb236..538fa5404603 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -245,18 +245,19 @@ int
>   i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
>   			    struct drm_file *file)
>   {
> -	struct drm_i915_private *dev_priv = to_i915(dev);
> -	struct i915_ggtt *ggtt = &dev_priv->ggtt;
> +	struct i915_ggtt *ggtt = &to_i915(dev)->ggtt;
>   	struct drm_i915_gem_get_aperture *args = data;
>   	struct i915_vma *vma;
>   	u64 pinned;
>   
> +	mutex_lock(&ggtt->vm.mutex);
> +
>   	pinned = ggtt->vm.reserved;
> -	mutex_lock(&dev->struct_mutex);
>   	list_for_each_entry(vma, &ggtt->vm.bound_list, vm_link)
>   		if (i915_vma_is_pinned(vma))
>   			pinned += vma->node.size;
> -	mutex_unlock(&dev->struct_mutex);
> +
> +	mutex_unlock(&ggtt->vm.mutex);
>   
>   	args->aper_size = ggtt->vm.total;
>   	args->aper_available_size = args->aper_size - pinned;
> @@ -1529,20 +1530,21 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
>   
>   static void i915_gem_object_bump_inactive_ggtt(struct drm_i915_gem_object *obj)
>   {
> -	struct drm_i915_private *i915;
> +	struct drm_i915_private *i915 = to_i915(obj->base.dev);
>   	struct list_head *list;
>   	struct i915_vma *vma;
>   
>   	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
>   
> +	mutex_lock(&i915->ggtt.vm.mutex);
>   	for_each_ggtt_vma(vma, obj) {
>   		if (!drm_mm_node_allocated(&vma->node))
>   			continue;
>   
>   		list_move_tail(&vma->vm_link, &vma->vm->bound_list);
>   	}
> +	mutex_unlock(&i915->ggtt.vm.mutex);
>   
> -	i915 = to_i915(obj->base.dev);
>   	spin_lock(&i915->mm.obj_lock);
>   	list = obj->bind_count ? &i915->mm.bound_list : &i915->mm.unbound_list;
>   	list_move_tail(&obj->mm.link, list);
> diff --git a/drivers/gpu/drm/i915/i915_gem_evict.c b/drivers/gpu/drm/i915/i915_gem_evict.c
> index 5cfe4b75e7d6..dc137701acb8 100644
> --- a/drivers/gpu/drm/i915/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/i915_gem_evict.c
> @@ -432,6 +432,7 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
>   	}
>   
>   	INIT_LIST_HEAD(&eviction_list);
> +	mutex_lock(&vm->mutex);
>   	list_for_each_entry(vma, &vm->bound_list, vm_link) {
>   		if (i915_vma_is_pinned(vma))
>   			continue;
> @@ -439,6 +440,7 @@ int i915_gem_evict_vm(struct i915_address_space *vm)
>   		__i915_vma_pin(vma);
>   		list_add(&vma->evict_link, &eviction_list);
>   	}
> +	mutex_unlock(&vm->mutex);
>   
>   	ret = 0;
>   	list_for_each_entry_safe(vma, next, &eviction_list, evict_link) {
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 2ad9070a54c1..49b00996a15e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -1931,7 +1931,10 @@ static struct i915_vma *pd_vma_create(struct gen6_hw_ppgtt *ppgtt, int size)
>   	vma->ggtt_view.type = I915_GGTT_VIEW_ROTATED; /* prevent fencing */
>   
>   	INIT_LIST_HEAD(&vma->obj_link);
> +
> +	mutex_lock(&vma->vm->mutex);
>   	list_add(&vma->vm_link, &vma->vm->unbound_list);
> +	mutex_unlock(&vma->vm->mutex);
>   
>   	return vma;
>   }
> @@ -3504,9 +3507,10 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
>   
>   	i915_check_and_clear_faults(dev_priv);
>   
> +	mutex_lock(&ggtt->vm.mutex);
> +
>   	/* First fill our portion of the GTT with scratch pages */
>   	ggtt->vm.clear_range(&ggtt->vm, 0, ggtt->vm.total);
> -
>   	ggtt->vm.closed = true; /* skip rewriting PTE on VMA unbind */
>   
>   	/* clflush objects bound into the GGTT and rebind them. */
> @@ -3516,19 +3520,26 @@ void i915_gem_restore_gtt_mappings(struct drm_i915_private *dev_priv)
>   		if (!(vma->flags & I915_VMA_GLOBAL_BIND))
>   			continue;
>   
> +		mutex_unlock(&ggtt->vm.mutex);
> +
>   		if (!i915_vma_unbind(vma))
> -			continue;
> +			goto lock;
>   
>   		WARN_ON(i915_vma_bind(vma,
>   				      obj ? obj->cache_level : 0,
>   				      PIN_UPDATE));
>   		if (obj)
>   			WARN_ON(i915_gem_object_set_to_gtt_domain(obj, false));
> +
> +lock:
> +		mutex_lock(&ggtt->vm.mutex);
>   	}
>   
>   	ggtt->vm.closed = false;
>   	i915_ggtt_invalidate(dev_priv);
>   
> +	mutex_unlock(&ggtt->vm.mutex);
> +
>   	if (INTEL_GEN(dev_priv) >= 8) {
>   		struct intel_ppat *ppat = &dev_priv->ppat;
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem_shrinker.c b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> index a76d6c95c824..6da795c7e62e 100644
> --- a/drivers/gpu/drm/i915/i915_gem_shrinker.c
> +++ b/drivers/gpu/drm/i915/i915_gem_shrinker.c
> @@ -461,6 +461,7 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
>   					       I915_SHRINK_VMAPS);
>   
>   	/* We also want to clear any cached iomaps as they wrap vmap */
> +	mutex_lock(&i915->ggtt.vm.mutex);
>   	list_for_each_entry_safe(vma, next,
>   				 &i915->ggtt.vm.bound_list, vm_link) {
>   		unsigned long count = vma->node.size >> PAGE_SHIFT;
> @@ -468,9 +469,12 @@ i915_gem_shrinker_vmap(struct notifier_block *nb, unsigned long event, void *ptr
>   		if (!vma->iomap || i915_vma_is_active(vma))
>   			continue;
>   
> +		mutex_unlock(&i915->ggtt.vm.mutex);
>   		if (i915_vma_unbind(vma) == 0)
>   			freed_pages += count;
> +		mutex_lock(&i915->ggtt.vm.mutex);
>   	}
> +	mutex_unlock(&i915->ggtt.vm.mutex);
>   
>   out:
>   	shrinker_unlock(i915, unlock);
> diff --git a/drivers/gpu/drm/i915/i915_gem_stolen.c b/drivers/gpu/drm/i915/i915_gem_stolen.c
> index a9e365789686..74a9661479ca 100644
> --- a/drivers/gpu/drm/i915/i915_gem_stolen.c
> +++ b/drivers/gpu/drm/i915/i915_gem_stolen.c
> @@ -702,7 +702,9 @@ i915_gem_object_create_stolen_for_preallocated(struct drm_i915_private *dev_priv
>   	vma->flags |= I915_VMA_GLOBAL_BIND;
>   	__i915_vma_set_map_and_fenceable(vma);
>   
> +	mutex_lock(&ggtt->vm.mutex);
>   	list_move_tail(&vma->vm_link, &ggtt->vm.bound_list);
> +	mutex_unlock(&ggtt->vm.mutex);
>   
>   	spin_lock(&dev_priv->mm.obj_lock);
>   	list_move_tail(&obj->mm.link, &dev_priv->mm.bound_list);
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index 7de28baffb8f..dcbd0d345c72 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -213,7 +213,10 @@ vma_create(struct drm_i915_gem_object *obj,
>   	}
>   	rb_link_node(&vma->obj_node, rb, p);
>   	rb_insert_color(&vma->obj_node, &obj->vma_tree);
> +
> +	mutex_lock(&vm->mutex);
>   	list_add(&vma->vm_link, &vm->unbound_list);
> +	mutex_unlock(&vm->mutex);
>   
>   	return vma;
>   
> @@ -656,7 +659,9 @@ i915_vma_insert(struct i915_vma *vma, u64 size, u64 alignment, u64 flags)
>   	GEM_BUG_ON(!drm_mm_node_allocated(&vma->node));
>   	GEM_BUG_ON(!i915_gem_valid_gtt_space(vma, cache_level));
>   
> +	mutex_lock(&vma->vm->mutex);
>   	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
> +	mutex_unlock(&vma->vm->mutex);
>   
>   	if (vma->obj) {
>   		struct drm_i915_gem_object *obj = vma->obj;
> @@ -689,8 +694,10 @@ i915_vma_remove(struct i915_vma *vma)
>   
>   	vma->ops->clear_pages(vma);
>   
> +	mutex_lock(&vma->vm->mutex);
>   	drm_mm_remove_node(&vma->node);
>   	list_move_tail(&vma->vm_link, &vma->vm->unbound_list);
> +	mutex_unlock(&vma->vm->mutex);
>   
>   	/*
>   	 * Since the unbound list is global, only move to that list if
> @@ -802,7 +809,11 @@ static void __i915_vma_destroy(struct i915_vma *vma)
>   	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
>   
>   	list_del(&vma->obj_link);
> +
> +	mutex_lock(&vma->vm->mutex);
>   	list_del(&vma->vm_link);
> +	mutex_unlock(&vma->vm->mutex);
> +
>   	if (vma->obj)
>   		rb_erase(&vma->obj_node, &vma->obj->vma_tree);
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> index 9e6b9304f2bd..c8deb961a020 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_evict.c
> @@ -109,10 +109,13 @@ static int populate_ggtt(struct drm_i915_private *i915)
>   
>   static void unpin_ggtt(struct drm_i915_private *i915)
>   {
> +	struct i915_ggtt *ggtt = &i915->ggtt;
>   	struct i915_vma *vma;
>   
> +	mutex_lock(&ggtt->vm.mutex);
>   	list_for_each_entry(vma, &i915->ggtt.vm.bound_list, vm_link)
>   		i915_vma_unpin(vma);
> +	mutex_unlock(&ggtt->vm.mutex);
>   }
>   
>   static void cleanup_objects(struct drm_i915_private *i915)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 852b06cb50a0..35eb40e5de91 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -1237,7 +1237,10 @@ static void track_vma_bind(struct i915_vma *vma)
>   	__i915_gem_object_pin_pages(obj);
>   
>   	vma->pages = obj->mm.pages;
> +
> +	mutex_lock(&vma->vm->mutex);
>   	list_move_tail(&vma->vm_link, &vma->vm->bound_list);
> +	mutex_unlock(&vma->vm->mutex);
>   }
>   
>   static int exercise_mock(struct drm_i915_private *i915,
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-18 16:03   ` Tvrtko Ursulin
@ 2019-01-18 16:06     ` Chris Wilson
  2019-01-22 14:19     ` Chris Wilson
  1 sibling, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-18 16:06 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
> 
> On 18/01/2019 14:00, Chris Wilson wrote:
> > Our goal is to remove struct_mutex and replace it with fine grained
> > locking. One of the thorny issues is our eviction logic for reclaiming
> > space for an execbuffer (or GTT mmaping, among a few other examples).
> > While eviction itself is easy to move under a per-VM mutex, performing
> > the activity tracking is less agreeable. One solution is not to do any
> > MRU tracking and do a simple coarse evaluation during eviction of
> > active/inactive, with a loose temporal ordering of last
> > insertion/evaluation. That keeps all the locking constrained to when we
> > are manipulating the VM itself, neatly avoiding the tricky handling of
> > possible recursive locking during execbuf and elsewhere.
> > 
> > Note that discarding the MRU is unlikely to impact upon our efficiency
> > to reclaim VM space (where we think a LRU model is best) as our
> > current strategy is to use random idle replacement first before doing
> > a search, and over time the use of softpinned 48b per-ppGTT is growing
> > (thereby eliminating any need to perform any eviction searches, in
> > theory at least).
> 
> There is still no mention of list consolidation.

There are two paragraphs about it above.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 09/38] drm/i915: Move vma lookup to its own lock
  2019-01-18 14:00 ` [PATCH 09/38] drm/i915: Move vma lookup to its own lock Chris Wilson
@ 2019-01-18 16:14   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-18 16:14 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> Remove the struct_mutex requirement for looking up the vma for an
> object.

With the change log added:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

It's not cool to keep omitting it and creating extra cross-referencing
work every time you post it. It's not like we don't have enough work
already.

Regards,

Tvrtko

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_debugfs.c       |  6 +--
>   drivers/gpu/drm/i915/i915_gem.c           | 33 +++++++-----
>   drivers/gpu/drm/i915/i915_gem_object.h    | 45 +++++++++-------
>   drivers/gpu/drm/i915/i915_vma.c           | 66 ++++++++++++++++-------
>   drivers/gpu/drm/i915/i915_vma.h           |  2 +-
>   drivers/gpu/drm/i915/selftests/i915_vma.c |  4 +-
>   6 files changed, 98 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 3ec369980d40..2a6e4044f25b 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -159,14 +159,14 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
>   		   obj->mm.madv == I915_MADV_DONTNEED ? " purgeable" : "");
>   	if (obj->base.name)
>   		seq_printf(m, " (name: %d)", obj->base.name);
> -	list_for_each_entry(vma, &obj->vma_list, obj_link) {
> +	list_for_each_entry(vma, &obj->vma.list, obj_link) {
>   		if (i915_vma_is_pinned(vma))
>   			pin_count++;
>   	}
>   	seq_printf(m, " (pinned x %d)", pin_count);
>   	if (obj->pin_global)
>   		seq_printf(m, " (global)");
> -	list_for_each_entry(vma, &obj->vma_list, obj_link) {
> +	list_for_each_entry(vma, &obj->vma.list, obj_link) {
>   		if (!drm_mm_node_allocated(&vma->node))
>   			continue;
>   
> @@ -322,7 +322,7 @@ static int per_file_stats(int id, void *ptr, void *data)
>   	if (obj->base.name || obj->base.dma_buf)
>   		stats->shared += obj->base.size;
>   
> -	list_for_each_entry(vma, &obj->vma_list, obj_link) {
> +	list_for_each_entry(vma, &obj->vma.list, obj_link) {
>   		if (!drm_mm_node_allocated(&vma->node))
>   			continue;
>   
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 538fa5404603..15acd052da46 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -437,15 +437,19 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
>   	if (ret)
>   		return ret;
>   
> -	while ((vma = list_first_entry_or_null(&obj->vma_list,
> -					       struct i915_vma,
> -					       obj_link))) {
> +	spin_lock(&obj->vma.lock);
> +	while (!ret && (vma = list_first_entry_or_null(&obj->vma.list,
> +						       struct i915_vma,
> +						       obj_link))) {
>   		list_move_tail(&vma->obj_link, &still_in_list);
> +		spin_unlock(&obj->vma.lock);
> +
>   		ret = i915_vma_unbind(vma);
> -		if (ret)
> -			break;
> +
> +		spin_lock(&obj->vma.lock);
>   	}
> -	list_splice(&still_in_list, &obj->vma_list);
> +	list_splice(&still_in_list, &obj->vma.list);
> +	spin_unlock(&obj->vma.lock);
>   
>   	return ret;
>   }
> @@ -3489,7 +3493,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>   	 * reading an invalid PTE on older architectures.
>   	 */
>   restart:
> -	list_for_each_entry(vma, &obj->vma_list, obj_link) {
> +	list_for_each_entry(vma, &obj->vma.list, obj_link) {
>   		if (!drm_mm_node_allocated(&vma->node))
>   			continue;
>   
> @@ -3567,7 +3571,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>   			 */
>   		}
>   
> -		list_for_each_entry(vma, &obj->vma_list, obj_link) {
> +		list_for_each_entry(vma, &obj->vma.list, obj_link) {
>   			if (!drm_mm_node_allocated(&vma->node))
>   				continue;
>   
> @@ -3577,7 +3581,7 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>   		}
>   	}
>   
> -	list_for_each_entry(vma, &obj->vma_list, obj_link)
> +	list_for_each_entry(vma, &obj->vma.list, obj_link)
>   		vma->node.color = cache_level;
>   	i915_gem_object_set_cache_coherency(obj, cache_level);
>   	obj->cache_dirty = true; /* Always invalidate stale cachelines */
> @@ -4153,7 +4157,9 @@ void i915_gem_object_init(struct drm_i915_gem_object *obj,
>   {
>   	mutex_init(&obj->mm.lock);
>   
> -	INIT_LIST_HEAD(&obj->vma_list);
> +	spin_lock_init(&obj->vma.lock);
> +	INIT_LIST_HEAD(&obj->vma.list);
> +
>   	INIT_LIST_HEAD(&obj->lut_list);
>   	INIT_LIST_HEAD(&obj->batch_pool_link);
>   
> @@ -4319,14 +4325,13 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
>   		mutex_lock(&i915->drm.struct_mutex);
>   
>   		GEM_BUG_ON(i915_gem_object_is_active(obj));
> -		list_for_each_entry_safe(vma, vn,
> -					 &obj->vma_list, obj_link) {
> +		list_for_each_entry_safe(vma, vn, &obj->vma.list, obj_link) {
>   			GEM_BUG_ON(i915_vma_is_active(vma));
>   			vma->flags &= ~I915_VMA_PIN_MASK;
>   			i915_vma_destroy(vma);
>   		}
> -		GEM_BUG_ON(!list_empty(&obj->vma_list));
> -		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma_tree));
> +		GEM_BUG_ON(!list_empty(&obj->vma.list));
> +		GEM_BUG_ON(!RB_EMPTY_ROOT(&obj->vma.tree));
>   
>   		/* This serializes freeing with the shrinker. Since the free
>   		 * is delayed, first by RCU then by the workqueue, we want the
> diff --git a/drivers/gpu/drm/i915/i915_gem_object.h b/drivers/gpu/drm/i915/i915_gem_object.h
> index cb1b0144d274..5a33b6d9f942 100644
> --- a/drivers/gpu/drm/i915/i915_gem_object.h
> +++ b/drivers/gpu/drm/i915/i915_gem_object.h
> @@ -87,24 +87,33 @@ struct drm_i915_gem_object {
>   
>   	const struct drm_i915_gem_object_ops *ops;
>   
> -	/**
> -	 * @vma_list: List of VMAs backed by this object
> -	 *
> -	 * The VMA on this list are ordered by type, all GGTT vma are placed
> -	 * at the head and all ppGTT vma are placed at the tail. The different
> -	 * types of GGTT vma are unordered between themselves, use the
> -	 * @vma_tree (which has a defined order between all VMA) to find an
> -	 * exact match.
> -	 */
> -	struct list_head vma_list;
> -	/**
> -	 * @vma_tree: Ordered tree of VMAs backed by this object
> -	 *
> -	 * All VMA created for this object are placed in the @vma_tree for
> -	 * fast retrieval via a binary search in i915_vma_instance().
> -	 * They are also added to @vma_list for easy iteration.
> -	 */
> -	struct rb_root vma_tree;
> +	struct {
> +		/**
> +		 * @vma.lock: protect the list/tree of vmas
> +		 */
> +		struct spinlock lock;
> +
> +		/**
> +		 * @vma.list: List of VMAs backed by this object
> +		 *
> +		 * The VMA on this list are ordered by type, all GGTT vma are
> +		 * placed at the head and all ppGTT vma are placed at the tail.
> +		 * The different types of GGTT vma are unordered between
> +		 * themselves, use the @vma.tree (which has a defined order
> +		 * between all VMA) to quickly find an exact match.
> +		 */
> +		struct list_head list;
> +
> +		/**
> +		 * @vma.tree: Ordered tree of VMAs backed by this object
> +		 *
> +		 * All VMA created for this object are placed in the @vma.tree
> +		 * for fast retrieval via a binary search in
> +		 * i915_vma_instance(). They are also added to @vma.list for
> +		 * easy iteration.
> +		 */
> +		struct rb_root tree;
> +	} vma;
>   
>   	/**
>   	 * @lut_list: List of vma lookup entries in use for this object.
> diff --git a/drivers/gpu/drm/i915/i915_vma.c b/drivers/gpu/drm/i915/i915_vma.c
> index dcbd0d345c72..d83b8ad5f859 100644
> --- a/drivers/gpu/drm/i915/i915_vma.c
> +++ b/drivers/gpu/drm/i915/i915_vma.c
> @@ -187,32 +187,52 @@ vma_create(struct drm_i915_gem_object *obj,
>   								i915_gem_object_get_stride(obj));
>   		GEM_BUG_ON(!is_power_of_2(vma->fence_alignment));
>   
> -		/*
> -		 * We put the GGTT vma at the start of the vma-list, followed
> -		 * by the ppGGTT vma. This allows us to break early when
> -		 * iterating over only the GGTT vma for an object, see
> -		 * for_each_ggtt_vma()
> -		 */
>   		vma->flags |= I915_VMA_GGTT;
> -		list_add(&vma->obj_link, &obj->vma_list);
> -	} else {
> -		list_add_tail(&vma->obj_link, &obj->vma_list);
>   	}
>   
> +	spin_lock(&obj->vma.lock);
> +
>   	rb = NULL;
> -	p = &obj->vma_tree.rb_node;
> +	p = &obj->vma.tree.rb_node;
>   	while (*p) {
>   		struct i915_vma *pos;
> +		long cmp;
>   
>   		rb = *p;
>   		pos = rb_entry(rb, struct i915_vma, obj_node);
> -		if (i915_vma_compare(pos, vm, view) < 0)
> +
> +		/*
> +		 * If the view already exists in the tree, another thread
> +		 * already created a matching vma, so return the older instance
> +		 * and dispose of ours.
> +		 */
> +		cmp = i915_vma_compare(pos, vm, view);
> +		if (cmp == 0) {
> +			spin_unlock(&obj->vma.lock);
> +			kmem_cache_free(vm->i915->vmas, vma);
> +			return pos;
> +		}
> +
> +		if (cmp < 0)
>   			p = &rb->rb_right;
>   		else
>   			p = &rb->rb_left;
>   	}
>   	rb_link_node(&vma->obj_node, rb, p);
> -	rb_insert_color(&vma->obj_node, &obj->vma_tree);
> +	rb_insert_color(&vma->obj_node, &obj->vma.tree);
> +
> +	if (i915_vma_is_ggtt(vma))
> +		/*
> +		 * We put the GGTT vma at the start of the vma-list, followed
> +		 * by the ppGGTT vma. This allows us to break early when
> +		 * iterating over only the GGTT vma for an object, see
> +		 * for_each_ggtt_vma()
> +		 */
> +		list_add(&vma->obj_link, &obj->vma.list);
> +	else
> +		list_add_tail(&vma->obj_link, &obj->vma.list);
> +
> +	spin_unlock(&obj->vma.lock);
>   
>   	mutex_lock(&vm->mutex);
>   	list_add(&vma->vm_link, &vm->unbound_list);
> @@ -232,7 +252,7 @@ vma_lookup(struct drm_i915_gem_object *obj,
>   {
>   	struct rb_node *rb;
>   
> -	rb = obj->vma_tree.rb_node;
> +	rb = obj->vma.tree.rb_node;
>   	while (rb) {
>   		struct i915_vma *vma = rb_entry(rb, struct i915_vma, obj_node);
>   		long cmp;
> @@ -272,16 +292,18 @@ i915_vma_instance(struct drm_i915_gem_object *obj,
>   {
>   	struct i915_vma *vma;
>   
> -	lockdep_assert_held(&obj->base.dev->struct_mutex);
>   	GEM_BUG_ON(view && !i915_is_ggtt(vm));
>   	GEM_BUG_ON(vm->closed);
>   
> +	spin_lock(&obj->vma.lock);
>   	vma = vma_lookup(obj, vm, view);
> -	if (!vma)
> +	spin_unlock(&obj->vma.lock);
> +
> +	/* vma_create() will resolve the race if another creates the vma */
> +	if (unlikely(!vma))
>   		vma = vma_create(obj, vm, view);
>   
>   	GEM_BUG_ON(!IS_ERR(vma) && i915_vma_compare(vma, vm, view));
> -	GEM_BUG_ON(!IS_ERR(vma) && vma_lookup(obj, vm, view) != vma);
>   	return vma;
>   }
>   
> @@ -808,14 +830,18 @@ static void __i915_vma_destroy(struct i915_vma *vma)
>   
>   	GEM_BUG_ON(i915_gem_active_isset(&vma->last_fence));
>   
> -	list_del(&vma->obj_link);
> -
>   	mutex_lock(&vma->vm->mutex);
>   	list_del(&vma->vm_link);
>   	mutex_unlock(&vma->vm->mutex);
>   
> -	if (vma->obj)
> -		rb_erase(&vma->obj_node, &vma->obj->vma_tree);
> +	if (vma->obj) {
> +		struct drm_i915_gem_object *obj = vma->obj;
> +
> +		spin_lock(&obj->vma.lock);
> +		list_del(&vma->obj_link);
> +		rb_erase(&vma->obj_node, &vma->obj->vma.tree);
> +		spin_unlock(&obj->vma.lock);
> +	}
>   
>   	rbtree_postorder_for_each_entry_safe(iter, n, &vma->active, node) {
>   		GEM_BUG_ON(i915_gem_active_isset(&iter->base));
> diff --git a/drivers/gpu/drm/i915/i915_vma.h b/drivers/gpu/drm/i915/i915_vma.h
> index 4f7c1c7599f4..7252abc73d3e 100644
> --- a/drivers/gpu/drm/i915/i915_vma.h
> +++ b/drivers/gpu/drm/i915/i915_vma.h
> @@ -425,7 +425,7 @@ void i915_vma_parked(struct drm_i915_private *i915);
>    * or the list is empty ofc.
>    */
>   #define for_each_ggtt_vma(V, OBJ) \
> -	list_for_each_entry(V, &(OBJ)->vma_list, obj_link)		\
> +	list_for_each_entry(V, &(OBJ)->vma.list, obj_link)		\
>   		for_each_until(!i915_vma_is_ggtt(V))
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/selftests/i915_vma.c b/drivers/gpu/drm/i915/selftests/i915_vma.c
> index ffa74290e054..f1008b07dfd2 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_vma.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_vma.c
> @@ -670,7 +670,7 @@ static int igt_vma_partial(void *arg)
>   		}
>   
>   		count = 0;
> -		list_for_each_entry(vma, &obj->vma_list, obj_link)
> +		list_for_each_entry(vma, &obj->vma.list, obj_link)
>   			count++;
>   		if (count != nvma) {
>   			pr_err("(%s) All partial vma were not recorded on the obj->vma_list: found %u, expected %u\n",
> @@ -699,7 +699,7 @@ static int igt_vma_partial(void *arg)
>   		i915_vma_unpin(vma);
>   
>   		count = 0;
> -		list_for_each_entry(vma, &obj->vma_list, obj_link)
> +		list_for_each_entry(vma, &obj->vma.list, obj_link)
>   			count++;
>   		if (count != nvma) {
>   			pr_err("(%s) allocated an extra full vma!\n", p->name);
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context
  2019-01-18 14:00 ` [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context Chris Wilson
@ 2019-01-18 16:26   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-18 16:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> To correctly simulate preemption between contexts, we need independent
> timelines along each context. Make it so.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/selftests/mock_engine.c | 90 ++++++++++----------
>   1 file changed, 47 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 9fe5b2c8f8d4..8b8d51af7d6a 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -30,6 +30,36 @@ struct mock_ring {
>   	struct i915_timeline timeline;
>   };
>   
> +static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
> +{
> +	const unsigned long sz = PAGE_SIZE / 2;
> +	struct mock_ring *ring;
> +
> +	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
> +	if (!ring)
> +		return NULL;
> +
> +	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
> +
> +	ring->base.size = sz;
> +	ring->base.effective_size = sz;
> +	ring->base.vaddr = (void *)(ring + 1);
> +	ring->base.timeline = &ring->timeline;
> +
> +	INIT_LIST_HEAD(&ring->base.request_list);
> +	intel_ring_update_space(&ring->base);
> +
> +	return &ring->base;
> +}
> +
> +static void mock_ring_free(struct intel_ring *base)
> +{
> +	struct mock_ring *ring = container_of(base, typeof(*ring), base);
> +
> +	i915_timeline_fini(&ring->timeline);
> +	kfree(ring);
> +}
> +
>   static struct mock_request *first_request(struct mock_engine *engine)
>   {
>   	return list_first_entry_or_null(&engine->hw_queue,
> @@ -80,6 +110,9 @@ static void mock_context_unpin(struct intel_context *ce)
>   static void mock_context_destroy(struct intel_context *ce)
>   {
>   	GEM_BUG_ON(ce->pin_count);
> +
> +	if (ce->ring)
> +		mock_ring_free(ce->ring);
>   }
>   
>   static const struct intel_context_ops mock_context_ops = {
> @@ -93,13 +126,22 @@ mock_context_pin(struct intel_engine_cs *engine,
>   {
>   	struct intel_context *ce = to_intel_context(ctx, engine);
>   
> -	if (!ce->pin_count++) {
> -		i915_gem_context_get(ctx);
> -		ce->ring = engine->buffer;
> -		ce->ops = &mock_context_ops;
> +	if (ce->pin_count++)
> +		return ce;
> +
> +	if (!ce->ring) {
> +		ce->ring = mock_ring(engine);
> +		if (!ce->ring)
> +			goto err;
>   	}
>   
> +	ce->ops = &mock_context_ops;
> +	i915_gem_context_get(ctx);
>   	return ce;
> +
> +err:
> +	ce->pin_count = 0;
> +	return ERR_PTR(-ENOMEM);
>   }
>   
>   static int mock_request_alloc(struct i915_request *request)
> @@ -143,36 +185,6 @@ static void mock_submit_request(struct i915_request *request)
>   	spin_unlock_irq(&engine->hw_lock);
>   }
>   
> -static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
> -{
> -	const unsigned long sz = PAGE_SIZE / 2;
> -	struct mock_ring *ring;
> -
> -	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
> -	if (!ring)
> -		return NULL;
> -
> -	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
> -
> -	ring->base.size = sz;
> -	ring->base.effective_size = sz;
> -	ring->base.vaddr = (void *)(ring + 1);
> -	ring->base.timeline = &ring->timeline;
> -
> -	INIT_LIST_HEAD(&ring->base.request_list);
> -	intel_ring_update_space(&ring->base);
> -
> -	return &ring->base;
> -}
> -
> -static void mock_ring_free(struct intel_ring *base)
> -{
> -	struct mock_ring *ring = container_of(base, typeof(*ring), base);
> -
> -	i915_timeline_fini(&ring->timeline);
> -	kfree(ring);
> -}
> -
>   struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   				    const char *name,
>   				    int id)
> @@ -207,17 +219,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	timer_setup(&engine->hw_delay, hw_delay_complete, 0);
>   	INIT_LIST_HEAD(&engine->hw_queue);
>   
> -	engine->base.buffer = mock_ring(&engine->base);
> -	if (!engine->base.buffer)
> -		goto err_breadcrumbs;
> -
>   	if (IS_ERR(intel_context_pin(i915->kernel_context, &engine->base)))
> -		goto err_ring;
> +		goto err_breadcrumbs;
>   
>   	return &engine->base;
>   
> -err_ring:
> -	mock_ring_free(engine->base.buffer);
>   err_breadcrumbs:
>   	intel_engine_fini_breadcrumbs(&engine->base);
>   	i915_timeline_fini(&engine->base.timeline);
> @@ -260,8 +266,6 @@ void mock_engine_free(struct intel_engine_cs *engine)
>   
>   	__intel_context_unpin(engine->i915->kernel_context, engine);
>   
> -	mock_ring_free(engine->buffer);
> -
>   	intel_engine_fini_breadcrumbs(engine);
>   	i915_timeline_fini(&engine->timeline);
>   
> 

I was sure I reviewed this yesterday or so, but even my sent mail folder
disagrees. Strange.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 12/38] drm/i915: Move list of timelines under its own lock
  2019-01-18 14:00 ` [PATCH 12/38] drm/i915: Move list of timelines under its own lock Chris Wilson
@ 2019-01-18 16:28   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-18 16:28 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> Currently, the list of timelines is serialised by the struct_mutex, but
> to alleviate difficulties with using that mutex in future, move the
> list management under its own dedicated mutex.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

> ---
>   drivers/gpu/drm/i915/i915_drv.h               |  5 +-
>   drivers/gpu/drm/i915/i915_gem.c               | 88 +++++++++----------
>   drivers/gpu/drm/i915/i915_reset.c             |  8 +-
>   drivers/gpu/drm/i915/i915_timeline.c          | 38 ++++++--
>   drivers/gpu/drm/i915/i915_timeline.h          |  3 +
>   .../gpu/drm/i915/selftests/mock_gem_device.c  |  7 +-
>   .../gpu/drm/i915/selftests/mock_timeline.c    |  3 +-
>   7 files changed, 94 insertions(+), 58 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 94680b15bed0..3913900600b7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1975,7 +1975,10 @@ struct drm_i915_private {
>   		void (*resume)(struct drm_i915_private *);
>   		void (*cleanup_engine)(struct intel_engine_cs *engine);
>   
> -		struct list_head timelines;
> +		struct i915_gt_timelines {
> +			struct mutex mutex; /* protects list, tainted by GPU */
> +			struct list_head list;
> +		} timelines;
>   
>   		struct list_head active_rings;
>   		struct list_head closed_vma;
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 15acd052da46..0b44059dfab2 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3222,33 +3222,6 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
>   	return ret;
>   }
>   
> -static long wait_for_timeline(struct i915_timeline *tl,
> -			      unsigned int flags, long timeout)
> -{
> -	struct i915_request *rq;
> -
> -	rq = i915_gem_active_get_unlocked(&tl->last_request);
> -	if (!rq)
> -		return timeout;
> -
> -	/*
> -	 * "Race-to-idle".
> -	 *
> -	 * Switching to the kernel context is often used a synchronous
> -	 * step prior to idling, e.g. in suspend for flushing all
> -	 * current operations to memory before sleeping. These we
> -	 * want to complete as quickly as possible to avoid prolonged
> -	 * stalls, so allow the gpu to boost to maximum clocks.
> -	 */
> -	if (flags & I915_WAIT_FOR_IDLE_BOOST)
> -		gen6_rps_boost(rq, NULL);
> -
> -	timeout = i915_request_wait(rq, flags, timeout);
> -	i915_request_put(rq);
> -
> -	return timeout;
> -}
> -
>   static int wait_for_engines(struct drm_i915_private *i915)
>   {
>   	if (wait_for(intel_engines_are_idle(i915), I915_IDLE_ENGINES_TIMEOUT)) {
> @@ -3265,6 +3238,8 @@ static int wait_for_engines(struct drm_i915_private *i915)
>   int i915_gem_wait_for_idle(struct drm_i915_private *i915,
>   			   unsigned int flags, long timeout)
>   {
> +	struct i915_timeline *tl;
> +
>   	GEM_TRACE("flags=%x (%s), timeout=%ld%s\n",
>   		  flags, flags & I915_WAIT_LOCKED ? "locked" : "unlocked",
>   		  timeout, timeout == MAX_SCHEDULE_TIMEOUT ? " (forever)" : "");
> @@ -3273,17 +3248,44 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
>   	if (!READ_ONCE(i915->gt.awake))
>   		return 0;
>   
> +	mutex_lock(&i915->gt.timelines.mutex);
> +	list_for_each_entry(tl, &i915->gt.timelines.list, link) {
> +		struct i915_request *rq;
> +
> +		rq = i915_gem_active_get_unlocked(&tl->last_request);
> +		if (!rq)
> +			continue;
> +
> +		mutex_unlock(&i915->gt.timelines.mutex);
> +
> +		/*
> +		 * "Race-to-idle".
> +		 *
> +		 * Switching to the kernel context is often used a synchronous
> +		 * step prior to idling, e.g. in suspend for flushing all
> +		 * current operations to memory before sleeping. These we
> +		 * want to complete as quickly as possible to avoid prolonged
> +		 * stalls, so allow the gpu to boost to maximum clocks.
> +		 */
> +		if (flags & I915_WAIT_FOR_IDLE_BOOST)
> +			gen6_rps_boost(rq, NULL);
> +
> +		timeout = i915_request_wait(rq, flags, timeout);
> +		i915_request_put(rq);
> +		if (timeout < 0)
> +			return timeout;
> +
> +		/* restart after reacquiring the lock */
> +		mutex_lock(&i915->gt.timelines.mutex);
> +		tl = list_entry(&i915->gt.timelines.list, typeof(*tl), link);
> +	}
> +	mutex_unlock(&i915->gt.timelines.mutex);
> +
>   	if (flags & I915_WAIT_LOCKED) {
> -		struct i915_timeline *tl;
>   		int err;
>   
>   		lockdep_assert_held(&i915->drm.struct_mutex);
>   
> -		list_for_each_entry(tl, &i915->gt.timelines, link) {
> -			timeout = wait_for_timeline(tl, flags, timeout);
> -			if (timeout < 0)
> -				return timeout;
> -		}
>   		if (GEM_SHOW_DEBUG() && !timeout) {
>   			/* Presume that timeout was non-zero to begin with! */
>   			dev_warn(&i915->drm.pdev->dev,
> @@ -3297,17 +3299,6 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915,
>   
>   		i915_retire_requests(i915);
>   		GEM_BUG_ON(i915->gt.active_requests);
> -	} else {
> -		struct intel_engine_cs *engine;
> -		enum intel_engine_id id;
> -
> -		for_each_engine(engine, i915, id) {
> -			struct i915_timeline *tl = &engine->timeline;
> -
> -			timeout = wait_for_timeline(tl, flags, timeout);
> -			if (timeout < 0)
> -				return timeout;
> -		}
>   	}
>   
>   	return 0;
> @@ -5008,6 +4999,8 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   		dev_priv->gt.cleanup_engine = intel_engine_cleanup;
>   	}
>   
> +	i915_timelines_init(dev_priv);
> +
>   	ret = i915_gem_init_userptr(dev_priv);
>   	if (ret)
>   		return ret;
> @@ -5130,8 +5123,10 @@ int i915_gem_init(struct drm_i915_private *dev_priv)
>   err_uc_misc:
>   	intel_uc_fini_misc(dev_priv);
>   
> -	if (ret != -EIO)
> +	if (ret != -EIO) {
>   		i915_gem_cleanup_userptr(dev_priv);
> +		i915_timelines_fini(dev_priv);
> +	}
>   
>   	if (ret == -EIO) {
>   		mutex_lock(&dev_priv->drm.struct_mutex);
> @@ -5182,6 +5177,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv)
>   
>   	intel_uc_fini_misc(dev_priv);
>   	i915_gem_cleanup_userptr(dev_priv);
> +	i915_timelines_fini(dev_priv);
>   
>   	i915_gem_drain_freed_objects(dev_priv);
>   
> @@ -5284,7 +5280,6 @@ int i915_gem_init_early(struct drm_i915_private *dev_priv)
>   	if (!dev_priv->priorities)
>   		goto err_dependencies;
>   
> -	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>   	INIT_LIST_HEAD(&dev_priv->gt.active_rings);
>   	INIT_LIST_HEAD(&dev_priv->gt.closed_vma);
>   
> @@ -5328,7 +5323,6 @@ void i915_gem_cleanup_early(struct drm_i915_private *dev_priv)
>   	GEM_BUG_ON(!llist_empty(&dev_priv->mm.free_list));
>   	GEM_BUG_ON(atomic_read(&dev_priv->mm.free_count));
>   	WARN_ON(dev_priv->mm.object_count);
> -	WARN_ON(!list_empty(&dev_priv->gt.timelines));
>   
>   	kmem_cache_destroy(dev_priv->priorities);
>   	kmem_cache_destroy(dev_priv->dependencies);
> diff --git a/drivers/gpu/drm/i915/i915_reset.c b/drivers/gpu/drm/i915/i915_reset.c
> index d44b095e2860..12e5a2bc825c 100644
> --- a/drivers/gpu/drm/i915/i915_reset.c
> +++ b/drivers/gpu/drm/i915/i915_reset.c
> @@ -850,7 +850,8 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>   	 *
>   	 * No more can be submitted until we reset the wedged bit.
>   	 */
> -	list_for_each_entry(tl, &i915->gt.timelines, link) {
> +	mutex_lock(&i915->gt.timelines.mutex);
> +	list_for_each_entry(tl, &i915->gt.timelines.list, link) {
>   		struct i915_request *rq;
>   		long timeout;
>   
> @@ -872,9 +873,12 @@ bool i915_gem_unset_wedged(struct drm_i915_private *i915)
>   		timeout = dma_fence_default_wait(&rq->fence, true,
>   						 MAX_SCHEDULE_TIMEOUT);
>   		i915_request_put(rq);
> -		if (timeout < 0)
> +		if (timeout < 0) {
> +			mutex_unlock(&i915->gt.timelines.mutex);
>   			goto unlock;
> +		}
>   	}
> +	mutex_unlock(&i915->gt.timelines.mutex);
>   
>   	intel_engines_sanitize(i915, false);
>   
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index 4667cc08c416..84550f17d3df 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -13,7 +13,7 @@ void i915_timeline_init(struct drm_i915_private *i915,
>   			struct i915_timeline *timeline,
>   			const char *name)
>   {
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> +	struct i915_gt_timelines *gt = &i915->gt.timelines;
>   
>   	/*
>   	 * Ideally we want a set of engines on a single leaf as we expect
> @@ -23,9 +23,12 @@ void i915_timeline_init(struct drm_i915_private *i915,
>   	 */
>   	BUILD_BUG_ON(KSYNCMAP < I915_NUM_ENGINES);
>   
> +	timeline->i915 = i915;
>   	timeline->name = name;
>   
> -	list_add(&timeline->link, &i915->gt.timelines);
> +	mutex_lock(&gt->mutex);
> +	list_add(&timeline->link, &gt->list);
> +	mutex_unlock(&gt->mutex);
>   
>   	/* Called during early_init before we know how many engines there are */
>   
> @@ -39,6 +42,17 @@ void i915_timeline_init(struct drm_i915_private *i915,
>   	i915_syncmap_init(&timeline->sync);
>   }
>   
> +void i915_timelines_init(struct drm_i915_private *i915)
> +{
> +	struct i915_gt_timelines *gt = &i915->gt.timelines;
> +
> +	mutex_init(&gt->mutex);
> +	INIT_LIST_HEAD(&gt->list);
> +
> +	/* via i915_gem_wait_for_idle() */
> +	i915_gem_shrinker_taints_mutex(i915, &gt->mutex);
> +}
> +
>   /**
>    * i915_timelines_park - called when the driver idles
>    * @i915: the drm_i915_private device
> @@ -51,11 +65,11 @@ void i915_timeline_init(struct drm_i915_private *i915,
>    */
>   void i915_timelines_park(struct drm_i915_private *i915)
>   {
> +	struct i915_gt_timelines *gt = &i915->gt.timelines;
>   	struct i915_timeline *timeline;
>   
> -	lockdep_assert_held(&i915->drm.struct_mutex);
> -
> -	list_for_each_entry(timeline, &i915->gt.timelines, link) {
> +	mutex_lock(&gt->mutex);
> +	list_for_each_entry(timeline, &gt->list, link) {
>   		/*
>   		 * All known fences are completed so we can scrap
>   		 * the current sync point tracking and start afresh,
> @@ -64,15 +78,20 @@ void i915_timelines_park(struct drm_i915_private *i915)
>   		 */
>   		i915_syncmap_free(&timeline->sync);
>   	}
> +	mutex_unlock(&gt->mutex);
>   }
>   
>   void i915_timeline_fini(struct i915_timeline *timeline)
>   {
> +	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
> +
>   	GEM_BUG_ON(!list_empty(&timeline->requests));
>   
>   	i915_syncmap_free(&timeline->sync);
>   
> +	mutex_lock(&gt->mutex);
>   	list_del(&timeline->link);
> +	mutex_unlock(&gt->mutex);
>   }
>   
>   struct i915_timeline *
> @@ -99,6 +118,15 @@ void __i915_timeline_free(struct kref *kref)
>   	kfree(timeline);
>   }
>   
> +void i915_timelines_fini(struct drm_i915_private *i915)
> +{
> +	struct i915_gt_timelines *gt = &i915->gt.timelines;
> +
> +	GEM_BUG_ON(!list_empty(&gt->list));
> +
> +	mutex_destroy(&gt->mutex);
> +}
> +
>   #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
>   #include "selftests/mock_timeline.c"
>   #include "selftests/i915_timeline.c"
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 38c1e15e927a..87ad2dd31c20 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -66,6 +66,7 @@ struct i915_timeline {
>   
>   	struct list_head link;
>   	const char *name;
> +	struct drm_i915_private *i915;
>   
>   	struct kref kref;
>   };
> @@ -134,6 +135,8 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
>   	return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno);
>   }
>   
> +void i915_timelines_init(struct drm_i915_private *i915);
>   void i915_timelines_park(struct drm_i915_private *i915);
> +void i915_timelines_fini(struct drm_i915_private *i915);
>   
>   #endif
> diff --git a/drivers/gpu/drm/i915/selftests/mock_gem_device.c b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> index 888c6978bc54..41ae502361d7 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_gem_device.c
> @@ -68,13 +68,14 @@ static void mock_device_release(struct drm_device *dev)
>   	i915_gem_contexts_fini(i915);
>   	mutex_unlock(&i915->drm.struct_mutex);
>   
> +	i915_timelines_fini(i915);
> +
>   	drain_workqueue(i915->wq);
>   	i915_gem_drain_freed_objects(i915);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   	mock_fini_ggtt(i915);
>   	mutex_unlock(&i915->drm.struct_mutex);
> -	WARN_ON(!list_empty(&i915->gt.timelines));
>   
>   	destroy_workqueue(i915->wq);
>   
> @@ -226,7 +227,8 @@ struct drm_i915_private *mock_gem_device(void)
>   	if (!i915->priorities)
>   		goto err_dependencies;
>   
> -	INIT_LIST_HEAD(&i915->gt.timelines);
> +	i915_timelines_init(i915);
> +
>   	INIT_LIST_HEAD(&i915->gt.active_rings);
>   	INIT_LIST_HEAD(&i915->gt.closed_vma);
>   
> @@ -253,6 +255,7 @@ struct drm_i915_private *mock_gem_device(void)
>   	i915_gem_contexts_fini(i915);
>   err_unlock:
>   	mutex_unlock(&i915->drm.struct_mutex);
> +	i915_timelines_fini(i915);
>   	kmem_cache_destroy(i915->priorities);
>   err_dependencies:
>   	kmem_cache_destroy(i915->dependencies);
> diff --git a/drivers/gpu/drm/i915/selftests/mock_timeline.c b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> index dcf3b16f5a07..cf39ccd9fc05 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_timeline.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_timeline.c
> @@ -10,6 +10,7 @@
>   
>   void mock_timeline_init(struct i915_timeline *timeline, u64 context)
>   {
> +	timeline->i915 = NULL;
>   	timeline->fence_context = context;
>   
>   	spin_lock_init(&timeline->lock);
> @@ -24,5 +25,5 @@ void mock_timeline_init(struct i915_timeline *timeline, u64 context)
>   
>   void mock_timeline_fini(struct i915_timeline *timeline)
>   {
> -	i915_timeline_fini(timeline);
> +	i915_syncmap_free(&timeline->sync);
>   }
> 

* Re: [PATCH 15/38] drm/i915: Allocate a status page for each timeline
  2019-01-18 14:00 ` [PATCH 15/38] drm/i915: Allocate a status page for each timeline Chris Wilson
@ 2019-01-21 11:18   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-21 11:18 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> Allocate a page for use as a status page by a group of timelines, as we
> only need a dword of storage for each (rounded up to the cacheline for
> safety) we can pack multiple timelines into the same page. Each timeline
> will then be able to track its own HW seqno.
> 
> v2: Reuse the common per-engine HWSP for the solitary ringbuffer
> timeline, so that we do not have to emit (using per-gen specialised
> vfuncs) the breadcrumb into the distinct timeline HWSP and instead can
> keep on using the common MI_STORE_DWORD_INDEX. However, to maintain the
> sleight-of-hand for the global/per-context seqno switchover, we will
> store both temporarily (and so use a custom offset for the shared timeline
> HWSP until the switch over).
> 
> v3: Keep things simple and allocate a page for each timeline, page
> sharing comes next.
> 
> v4: I was caught repeating the same MI_STORE_DWORD_IMM over and over
> again in selftests.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_timeline.c          | 106 +++++-
>   drivers/gpu/drm/i915/i915_timeline.h          |  21 +-
>   drivers/gpu/drm/i915/intel_engine_cs.c        |  64 ++--
>   drivers/gpu/drm/i915/intel_lrc.c              |  22 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.c       |  10 +-
>   drivers/gpu/drm/i915/intel_ringbuffer.h       |   6 +-
>   .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>   .../drm/i915/selftests/i915_mock_selftests.h  |   2 +-
>   .../gpu/drm/i915/selftests/i915_timeline.c    | 314 +++++++++++++++++-
>   drivers/gpu/drm/i915/selftests/mock_engine.c  |  17 +-
>   10 files changed, 512 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_timeline.c b/drivers/gpu/drm/i915/i915_timeline.c
> index 84550f17d3df..a7d902e9eaf1 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/i915_timeline.c
> @@ -9,11 +9,38 @@
>   #include "i915_timeline.h"
>   #include "i915_syncmap.h"
>   
> -void i915_timeline_init(struct drm_i915_private *i915,
> -			struct i915_timeline *timeline,
> -			const char *name)
> +static int hwsp_alloc(struct i915_timeline *timeline)
> +{
> +	struct drm_i915_private *i915 = timeline->i915;
> +	struct drm_i915_gem_object *obj;
> +	struct i915_vma *vma;
> +
> +	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
> +	if (IS_ERR(obj))
> +		return PTR_ERR(obj);
> +
> +	i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
> +
> +	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
> +	if (IS_ERR(vma)) {
> +		i915_gem_object_put(obj);
> +		return PTR_ERR(vma);
> +	}
> +
> +	timeline->hwsp_ggtt = vma;
> +	timeline->hwsp_offset = 0;
> +
> +	return 0;
> +}
> +
> +int i915_timeline_init(struct drm_i915_private *i915,
> +		       struct i915_timeline *timeline,
> +		       const char *name,
> +		       struct i915_vma *global_hwsp)
>   {
>   	struct i915_gt_timelines *gt = &i915->gt.timelines;
> +	void *vaddr;
> +	int err;
>   
>   	/*
>   	 * Ideally we want a set of engines on a single leaf as we expect
> @@ -25,10 +52,27 @@ void i915_timeline_init(struct drm_i915_private *i915,
>   
>   	timeline->i915 = i915;
>   	timeline->name = name;
> +	timeline->pin_count = 0;
> +
> +	if (global_hwsp) {
> +		timeline->hwsp_ggtt = i915_vma_get(global_hwsp);
> +		timeline->hwsp_offset = I915_GEM_HWS_SEQNO_ADDR;
> +	} else {
> +		err = hwsp_alloc(timeline);
> +		if (err)
> +			return err;
> +	}
>   
> -	mutex_lock(&gt->mutex);
> -	list_add(&timeline->link, &gt->list);
> -	mutex_unlock(&gt->mutex);
> +	vaddr = i915_gem_object_pin_map(timeline->hwsp_ggtt->obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr)) {
> +		i915_vma_put(timeline->hwsp_ggtt);
> +		return PTR_ERR(vaddr);
> +	}
> +
> +	timeline->hwsp_seqno =
> +		memset(vaddr + timeline->hwsp_offset,
> +		       0,
> +		       sizeof(*timeline->hwsp_seqno));
>   
>   	/* Called during early_init before we know how many engines there are */
>   
> @@ -40,6 +84,12 @@ void i915_timeline_init(struct drm_i915_private *i915,
>   	INIT_LIST_HEAD(&timeline->requests);
>   
>   	i915_syncmap_init(&timeline->sync);
> +
> +	mutex_lock(&gt->mutex);
> +	list_add(&timeline->link, &gt->list);
> +	mutex_unlock(&gt->mutex);
> +
> +	return 0;
>   }
>   
>   void i915_timelines_init(struct drm_i915_private *i915)
> @@ -85,6 +135,7 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   {
>   	struct i915_gt_timelines *gt = &timeline->i915->gt.timelines;
>   
> +	GEM_BUG_ON(timeline->pin_count);
>   	GEM_BUG_ON(!list_empty(&timeline->requests));
>   
>   	i915_syncmap_free(&timeline->sync);
> @@ -92,23 +143,62 @@ void i915_timeline_fini(struct i915_timeline *timeline)
>   	mutex_lock(&gt->mutex);
>   	list_del(&timeline->link);
>   	mutex_unlock(&gt->mutex);
> +
> +	i915_gem_object_unpin_map(timeline->hwsp_ggtt->obj);
> +	i915_vma_put(timeline->hwsp_ggtt);
>   }
>   
>   struct i915_timeline *
> -i915_timeline_create(struct drm_i915_private *i915, const char *name)
> +i915_timeline_create(struct drm_i915_private *i915,
> +		     const char *name,
> +		     struct i915_vma *global_hwsp)
>   {
>   	struct i915_timeline *timeline;
> +	int err;
>   
>   	timeline = kzalloc(sizeof(*timeline), GFP_KERNEL);
>   	if (!timeline)
>   		return ERR_PTR(-ENOMEM);
>   
> -	i915_timeline_init(i915, timeline, name);
> +	err = i915_timeline_init(i915, timeline, name, global_hwsp);
> +	if (err) {
> +		kfree(timeline);
> +		return ERR_PTR(err);
> +	}
> +
>   	kref_init(&timeline->kref);
>   
>   	return timeline;
>   }
>   
> +int i915_timeline_pin(struct i915_timeline *tl)
> +{
> +	int err;
> +
> +	if (tl->pin_count++)
> +		return 0;
> +	GEM_BUG_ON(!tl->pin_count);
> +
> +	err = i915_vma_pin(tl->hwsp_ggtt, 0, 0, PIN_GLOBAL | PIN_HIGH);
> +	if (err)
> +		goto unpin;
> +
> +	return 0;
> +
> +unpin:
> +	tl->pin_count = 0;
> +	return err;
> +}
> +
> +void i915_timeline_unpin(struct i915_timeline *tl)
> +{
> +	GEM_BUG_ON(!tl->pin_count);
> +	if (--tl->pin_count)
> +		return;
> +
> +	__i915_vma_unpin(tl->hwsp_ggtt);
> +}
> +
>   void __i915_timeline_free(struct kref *kref)
>   {
>   	struct i915_timeline *timeline =
> diff --git a/drivers/gpu/drm/i915/i915_timeline.h b/drivers/gpu/drm/i915/i915_timeline.h
> index 87ad2dd31c20..0c3739d53d79 100644
> --- a/drivers/gpu/drm/i915/i915_timeline.h
> +++ b/drivers/gpu/drm/i915/i915_timeline.h
> @@ -32,6 +32,8 @@
>   #include "i915_syncmap.h"
>   #include "i915_utils.h"
>   
> +struct i915_vma;
> +
>   struct i915_timeline {
>   	u64 fence_context;
>   	u32 seqno;
> @@ -40,6 +42,11 @@ struct i915_timeline {
>   #define TIMELINE_CLIENT 0 /* default subclass */
>   #define TIMELINE_ENGINE 1
>   
> +	unsigned int pin_count;
> +	const u32 *hwsp_seqno;
> +	struct i915_vma *hwsp_ggtt;
> +	u32 hwsp_offset;
> +
>   	/**
>   	 * List of breadcrumbs associated with GPU requests currently
>   	 * outstanding.
> @@ -71,9 +78,10 @@ struct i915_timeline {
>   	struct kref kref;
>   };
>   
> -void i915_timeline_init(struct drm_i915_private *i915,
> -			struct i915_timeline *tl,
> -			const char *name);
> +int i915_timeline_init(struct drm_i915_private *i915,
> +		       struct i915_timeline *tl,
> +		       const char *name,
> +		       struct i915_vma *hwsp);
>   void i915_timeline_fini(struct i915_timeline *tl);
>   
>   static inline void
> @@ -96,7 +104,9 @@ i915_timeline_set_subclass(struct i915_timeline *timeline,
>   }
>   
>   struct i915_timeline *
> -i915_timeline_create(struct drm_i915_private *i915, const char *name);
> +i915_timeline_create(struct drm_i915_private *i915,
> +		     const char *name,
> +		     struct i915_vma *global_hwsp);
>   
>   static inline struct i915_timeline *
>   i915_timeline_get(struct i915_timeline *timeline)
> @@ -135,6 +145,9 @@ static inline bool i915_timeline_sync_is_later(struct i915_timeline *tl,
>   	return __i915_timeline_sync_is_later(tl, fence->context, fence->seqno);
>   }
>   
> +int i915_timeline_pin(struct i915_timeline *tl);
> +void i915_timeline_unpin(struct i915_timeline *tl);
> +
>   void i915_timelines_init(struct drm_i915_private *i915);
>   void i915_timelines_park(struct drm_i915_private *i915);
>   void i915_timelines_fini(struct drm_i915_private *i915);
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 4b4b7358c482..c850d131d8c3 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -484,26 +484,6 @@ static void intel_engine_init_execlist(struct intel_engine_cs *engine)
>   	execlists->queue = RB_ROOT_CACHED;
>   }
>   
> -/**
> - * intel_engines_setup_common - setup engine state not requiring hw access
> - * @engine: Engine to setup.
> - *
> - * Initializes @engine@ structure members shared between legacy and execlists
> - * submission modes which do not require hardware access.
> - *
> - * Typically done early in the submission mode specific engine setup stage.
> - */
> -void intel_engine_setup_common(struct intel_engine_cs *engine)
> -{
> -	i915_timeline_init(engine->i915, &engine->timeline, engine->name);
> -	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
> -
> -	intel_engine_init_execlist(engine);
> -	intel_engine_init_hangcheck(engine);
> -	intel_engine_init_batch_pool(engine);
> -	intel_engine_init_cmd_parser(engine);
> -}
> -
>   static void cleanup_status_page(struct intel_engine_cs *engine)
>   {
>   	struct i915_vma *vma;
> @@ -601,6 +581,44 @@ static int init_status_page(struct intel_engine_cs *engine)
>   	return ret;
>   }
>   
> +/**
> + * intel_engines_setup_common - setup engine state not requiring hw access
> + * @engine: Engine to setup.
> + *
> + * Initializes @engine@ structure members shared between legacy and execlists
> + * submission modes which do not require hardware access.
> + *
> + * Typically done early in the submission mode specific engine setup stage.
> + */
> +int intel_engine_setup_common(struct intel_engine_cs *engine)
> +{
> +	int err;
> +
> +	err = init_status_page(engine);
> +	if (err)
> +		return err;
> +
> +	err = i915_timeline_init(engine->i915,
> +				 &engine->timeline,
> +				 engine->name,
> +				 engine->status_page.vma);
> +	if (err)
> +		goto err_hwsp;
> +
> +	i915_timeline_set_subclass(&engine->timeline, TIMELINE_ENGINE);
> +
> +	intel_engine_init_execlist(engine);
> +	intel_engine_init_hangcheck(engine);
> +	intel_engine_init_batch_pool(engine);
> +	intel_engine_init_cmd_parser(engine);
> +
> +	return 0;
> +
> +err_hwsp:
> +	cleanup_status_page(engine);
> +	return err;
> +}
> +
>   static void __intel_context_unpin(struct i915_gem_context *ctx,
>   				  struct intel_engine_cs *engine)
>   {
> @@ -653,14 +671,8 @@ int intel_engine_init_common(struct intel_engine_cs *engine)
>   	if (ret)
>   		goto err_unpin_preempt;
>   
> -	ret = init_status_page(engine);
> -	if (ret)
> -		goto err_breadcrumbs;
> -
>   	return 0;
>   
> -err_breadcrumbs:
> -	intel_engine_fini_breadcrumbs(engine);
>   err_unpin_preempt:
>   	if (i915->preempt_context)
>   		__intel_context_unpin(i915->preempt_context, engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dabebb68759f..ba59241add47 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2199,10 +2199,14 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
>   	engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
>   }
>   
> -static void
> +static int
>   logical_ring_setup(struct intel_engine_cs *engine)
>   {
> -	intel_engine_setup_common(engine);
> +	int err;
> +
> +	err = intel_engine_setup_common(engine);
> +	if (err)
> +		return err;
>   
>   	/* Intentionally left blank. */
>   	engine->buffer = NULL;
> @@ -2212,6 +2216,8 @@ logical_ring_setup(struct intel_engine_cs *engine)
>   
>   	logical_ring_default_vfuncs(engine);
>   	logical_ring_default_irqs(engine);
> +
> +	return 0;
>   }
>   
>   static int logical_ring_init(struct intel_engine_cs *engine)
> @@ -2260,7 +2266,9 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
>   {
>   	int ret;
>   
> -	logical_ring_setup(engine);
> +	ret = logical_ring_setup(engine);
> +	if (ret)
> +		return ret;
>   
>   	/* Override some for render ring. */
>   	engine->init_context = gen8_init_rcs_context;
> @@ -2290,7 +2298,11 @@ int logical_render_ring_init(struct intel_engine_cs *engine)
>   
>   int logical_xcs_ring_init(struct intel_engine_cs *engine)
>   {
> -	logical_ring_setup(engine);
> +	int err;
> +
> +	err = logical_ring_setup(engine);
> +	if (err)
> +		return err;
>   
>   	return logical_ring_init(engine);
>   }
> @@ -2624,7 +2636,7 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>   		goto error_deref_obj;
>   	}
>   
> -	timeline = i915_timeline_create(ctx->i915, ctx->name);
> +	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
>   	if (IS_ERR(timeline)) {
>   		ret = PTR_ERR(timeline);
>   		goto error_deref_obj;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index d72012b42f20..5887304bc3ae 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1539,9 +1539,13 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   	struct intel_ring *ring;
>   	int err;
>   
> -	intel_engine_setup_common(engine);
> +	err = intel_engine_setup_common(engine);
> +	if (err)
> +		return err;
>   
> -	timeline = i915_timeline_create(engine->i915, engine->name);
> +	timeline = i915_timeline_create(engine->i915,
> +					engine->name,
> +					engine->status_page.vma);
>   	if (IS_ERR(timeline)) {
>   		err = PTR_ERR(timeline);
>   		goto err;
> @@ -1565,6 +1569,8 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine)
>   	if (err)
>   		goto err_unpin;
>   
> +	GEM_BUG_ON(ring->timeline->hwsp_ggtt != engine->status_page.vma);
> +
>   	return 0;
>   
>   err_unpin:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index d1a82610e0c1..de0099ea926b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -720,7 +720,9 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>   #define I915_GEM_HWS_INDEX_ADDR (I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
>   #define I915_GEM_HWS_PREEMPT_INDEX	0x32
>   #define I915_GEM_HWS_PREEMPT_ADDR (I915_GEM_HWS_PREEMPT_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
> -#define I915_GEM_HWS_SCRATCH_INDEX	0x40
> +#define I915_GEM_HWS_SEQNO		0x40
> +#define I915_GEM_HWS_SEQNO_ADDR (I915_GEM_HWS_SEQNO << MI_STORE_DWORD_INDEX_SHIFT)
> +#define I915_GEM_HWS_SCRATCH_INDEX	0x80
>   #define I915_GEM_HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH_INDEX << MI_STORE_DWORD_INDEX_SHIFT)
>   
>   #define I915_HWS_CSB_BUF0_INDEX		0x10
> @@ -826,7 +828,7 @@ intel_ring_set_tail(struct intel_ring *ring, unsigned int tail)
>   
>   void intel_engine_write_global_seqno(struct intel_engine_cs *engine, u32 seqno);
>   
> -void intel_engine_setup_common(struct intel_engine_cs *engine);
> +int intel_engine_setup_common(struct intel_engine_cs *engine);
>   int intel_engine_init_common(struct intel_engine_cs *engine);
>   void intel_engine_cleanup_common(struct intel_engine_cs *engine);
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index a15713cae3b3..76b4f87fc853 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -13,6 +13,7 @@ selftest(sanitycheck, i915_live_sanitycheck) /* keep first (igt selfcheck) */
>   selftest(uncore, intel_uncore_live_selftests)
>   selftest(workarounds, intel_workarounds_live_selftests)
>   selftest(requests, i915_request_live_selftests)
> +selftest(timelines, i915_timeline_live_selftests)
>   selftest(objects, i915_gem_object_live_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_live_selftests)
>   selftest(coherency, i915_gem_coherency_live_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> index 1b70208eeea7..4a83a1c6c406 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_mock_selftests.h
> @@ -16,7 +16,7 @@ selftest(syncmap, i915_syncmap_mock_selftests)
>   selftest(uncore, intel_uncore_mock_selftests)
>   selftest(engine, intel_engine_cs_mock_selftests)
>   selftest(breadcrumbs, intel_breadcrumbs_mock_selftests)
> -selftest(timelines, i915_gem_timeline_mock_selftests)
> +selftest(timelines, i915_timeline_mock_selftests)
>   selftest(requests, i915_request_mock_selftests)
>   selftest(objects, i915_gem_object_mock_selftests)
>   selftest(dmabuf, i915_gem_dmabuf_mock_selftests)
> diff --git a/drivers/gpu/drm/i915/selftests/i915_timeline.c b/drivers/gpu/drm/i915/selftests/i915_timeline.c
> index 19f1c6a5c8fb..15a33ec7217d 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_timeline.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_timeline.c
> @@ -256,7 +256,7 @@ static int bench_sync(void *arg)
>   	return 0;
>   }
>   
> -int i915_gem_timeline_mock_selftests(void)
> +int i915_timeline_mock_selftests(void)
>   {
>   	static const struct i915_subtest tests[] = {
>   		SUBTEST(igt_sync),
> @@ -265,3 +265,315 @@ int i915_gem_timeline_mock_selftests(void)
>   
>   	return i915_subtests(tests, NULL);
>   }
> +
> +static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
> +{
> +	u32 *cs;
> +
> +	cs = intel_ring_begin(rq, 4);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	if (INTEL_GEN(rq->i915) >= 8) {
> +		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +		*cs++ = addr;
> +		*cs++ = 0;
> +		*cs++ = value;
> +	} else if (INTEL_GEN(rq->i915) >= 4) {
> +		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
> +		*cs++ = 0;
> +		*cs++ = addr;
> +		*cs++ = value;
> +	} else {
> +		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
> +		*cs++ = addr;
> +		*cs++ = value;
> +		*cs++ = MI_NOOP;
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +
> +	return 0;
> +}
> +
> +static u32 hwsp_address(const struct i915_timeline *tl)
> +{
> +	return i915_ggtt_offset(tl->hwsp_ggtt) + tl->hwsp_offset;
> +}
> +
> +static struct i915_request *
> +tl_write(struct i915_timeline *tl, struct intel_engine_cs *engine, u32 value)
> +{
> +	struct i915_request *rq;
> +	int err;
> +
> +	lockdep_assert_held(tl->i915->drm.struct_mutex); /* lazy with rq refs */
> +
> +	err = i915_timeline_pin(tl);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	rq = i915_request_alloc(engine, engine->i915->kernel_context);
> +	if (IS_ERR(rq))
> +		goto out_unpin;
> +
> +	err = emit_ggtt_store_dw(rq, hwsp_address(tl), value);
> +	i915_request_add(rq);
> +	if (err)
> +		rq = ERR_PTR(err);
> +
> +out_unpin:
> +	i915_timeline_unpin(tl);
> +	return rq;
> +}
> +
> +static int live_hwsp_engine(void *arg)
> +{
> +#define NUM_TIMELINES 4096
> +	struct drm_i915_private *i915 = arg;
> +	struct i915_timeline **timelines;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	intel_wakeref_t wakeref;
> +	unsigned long count, n;
> +	int err = 0;
> +
> +	/*
> +	 * Create a bunch of timelines and check we can write
> +	 * independently to each of their breadcrumb slots.
> +	 */
> +
> +	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
> +				   sizeof(*timelines),
> +				   GFP_KERNEL);
> +	if (!timelines)
> +		return -ENOMEM;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	count = 0;
> +	for_each_engine(engine, i915, id) {
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
> +
> +		for (n = 0; n < NUM_TIMELINES; n++) {
> +			struct i915_timeline *tl;
> +			struct i915_request *rq;
> +
> +			tl = i915_timeline_create(i915, "live", NULL);
> +			if (IS_ERR(tl)) {
> +				err = PTR_ERR(tl);
> +				goto out;
> +			}
> +
> +			if (*tl->hwsp_seqno != tl->seqno) {
> +				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
> +				       count, *tl->hwsp_seqno, tl->seqno);
> +				err = -EINVAL;
> +				i915_timeline_put(tl);
> +				goto out;
> +			}

The above block is repeated three times, so you could wrap it together with 
the timeline creation in a single helper.
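Something along these lines maybe (untested sketch, the helper name is made
up; it just pulls the timeline creation and the breadcrumb check into one
place):

	static struct i915_timeline *
	checked_timeline_create(struct drm_i915_private *i915, unsigned long count)
	{
		struct i915_timeline *tl;

		tl = i915_timeline_create(i915, "live", NULL);
		if (IS_ERR(tl))
			return tl;

		/* every new timeline should start with its breadcrumb already set */
		if (*tl->hwsp_seqno != tl->seqno) {
			pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
			       count, *tl->hwsp_seqno, tl->seqno);
			i915_timeline_put(tl);
			return ERR_PTR(-EINVAL);
		}

		return tl;
	}

The three subtests would then only need an IS_ERR() check on the return value.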

> +
> +			rq = tl_write(tl, engine, count);
> +			if (IS_ERR(rq)) {
> +				pr_err("Failed to write to timeline!\n");
> +				i915_timeline_put(tl);
> +				err = PTR_ERR(rq);
> +				goto out;
> +			}
> +
> +			timelines[count++] = tl;
> +		}
> +	}
> +
> +	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
> +
> +out:
> +	for (n = 0; n < count; n++) {
> +		struct i915_timeline *tl = timelines[n];
> +
> +		if (!err && *tl->hwsp_seqno != n) {
> +			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
> +			       n, *tl->hwsp_seqno);
> +			err = -EINVAL;
> +		}
> +		i915_timeline_put(tl);
> +	}

Okay, this loop is only repeated twice, so there isn't such a strong argument 
to consolidate it.

> +
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	kvfree(timelines);
> +
> +	return err;
> +#undef NUM_TIMELINES
> +}
> +
> +static int live_hwsp_alternate(void *arg)
> +{
> +#define NUM_TIMELINES 4096
> +	struct drm_i915_private *i915 = arg;
> +	struct i915_timeline **timelines;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	intel_wakeref_t wakeref;
> +	unsigned long count, n;
> +	int err = 0;
> +
> +	/*
> +	 * Create a bunch of timelines and check we can write
> +	 * independently to each of their breadcrumb slots with adjacent
> +	 * engines.
> +	 */
> +
> +	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
> +				   sizeof(*timelines),
> +				   GFP_KERNEL);
> +	if (!timelines)
> +		return -ENOMEM;
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	count = 0;
> +	for (n = 0; n < NUM_TIMELINES; n++) {
> +		for_each_engine(engine, i915, id) {
> +			struct i915_timeline *tl;
> +			struct i915_request *rq;
> +
> +			if (!intel_engine_can_store_dword(engine))
> +				continue;
> +
> +			tl = i915_timeline_create(i915, "live", NULL);
> +			if (IS_ERR(tl)) {
> +				err = PTR_ERR(tl);
> +				goto out;
> +			}
> +
> +			if (*tl->hwsp_seqno != tl->seqno) {
> +				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
> +				       count, *tl->hwsp_seqno, tl->seqno);
> +				err = -EINVAL;
> +				i915_timeline_put(tl);
> +				goto out;
> +			}
> +
> +			rq = tl_write(tl, engine, count);
> +			if (IS_ERR(rq)) {
> +				pr_err("Failed to write to timeline!\n");
> +				i915_timeline_put(tl);
> +				err = PTR_ERR(rq);
> +				goto out;
> +			}
> +
> +			timelines[count++] = tl;
> +		}
> +	}
> +
> +	err = i915_gem_wait_for_idle(i915, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
> +
> +out:
> +	for (n = 0; n < count; n++) {
> +		struct i915_timeline *tl = timelines[n];
> +
> +		if (!err && *tl->hwsp_seqno != n) {
> +			pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
> +			       n, *tl->hwsp_seqno);
> +			err = -EINVAL;
> +		}
> +		i915_timeline_put(tl);
> +	}
> +
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	kvfree(timelines);
> +
> +	return err;
> +#undef NUM_TIMELINES
> +}
> +
> +static int live_hwsp_recycle(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	intel_wakeref_t wakeref;
> +	unsigned long count;
> +	int err = 0;
> +
> +	/*
> +	 * Check seqno writes into one timeline at a time. We expect to
> +	 * recycle the breadcrumb slot between iterations and neither
> +	 * want to confuse ourselves or the GPU.
> +	 */
> +
> +	mutex_lock(&i915->drm.struct_mutex);
> +	wakeref = intel_runtime_pm_get(i915);
> +
> +	count = 0;
> +	for_each_engine(engine, i915, id) {
> +		IGT_TIMEOUT(end_time);
> +
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
> +
> +		do {
> +			struct i915_timeline *tl;
> +			struct i915_request *rq;
> +
> +			tl = i915_timeline_create(i915, "live", NULL);
> +			if (IS_ERR(tl)) {
> +				err = PTR_ERR(tl);
> +				goto out;
> +			}
> +
> +			if (*tl->hwsp_seqno != tl->seqno) {
> +				pr_err("Timeline %lu created with incorrect breadcrumb, found %x, expected %x\n",
> +				       count, *tl->hwsp_seqno, tl->seqno);
> +				err = -EINVAL;
> +				i915_timeline_put(tl);
> +				goto out;
> +			}
> +
> +			rq = tl_write(tl, engine, count);
> +			if (IS_ERR(rq)) {
> +				pr_err("Failed to write to timeline!\n");
> +				i915_timeline_put(tl);
> +				err = PTR_ERR(rq);
> +				goto out;
> +			}
> +
> +			i915_request_wait(rq, I915_WAIT_LOCKED, MAX_SCHEDULE_TIMEOUT);
> +			if (*tl->hwsp_seqno != count) {
> +				pr_err("Invalid seqno stored in timeline %lu, found 0x%x\n",
> +				       count, *tl->hwsp_seqno);
> +				err = -EINVAL;
> +			}
> +
> +			i915_timeline_put(tl);
> +			count++;
> +
> +			if (err)
> +				goto out;
> +		} while (!__igt_timeout(end_time, NULL));
> +	}
> +
> +out:
> +	intel_runtime_pm_put(i915, wakeref);
> +	mutex_unlock(&i915->drm.struct_mutex);
> +
> +	return err;
> +}
> +
> +int i915_timeline_live_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_hwsp_recycle),
> +		SUBTEST(live_hwsp_engine),
> +		SUBTEST(live_hwsp_alternate),
> +	};
> +
> +	return i915_subtests(tests, i915);
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/mock_engine.c b/drivers/gpu/drm/i915/selftests/mock_engine.c
> index 968a7e139a67..acd27c7e807b 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_engine.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_engine.c
> @@ -34,12 +34,20 @@ static struct intel_ring *mock_ring(struct intel_engine_cs *engine)
>   {
>   	const unsigned long sz = PAGE_SIZE / 2;
>   	struct mock_ring *ring;
> +	int err;
>   
>   	ring = kzalloc(sizeof(*ring) + sz, GFP_KERNEL);
>   	if (!ring)
>   		return NULL;
>   
> -	i915_timeline_init(engine->i915, &ring->timeline, engine->name);
> +	err = i915_timeline_init(engine->i915,
> +				 &ring->timeline,
> +				 engine->name,
> +				 NULL);
> +	if (err) {
> +		kfree(ring);
> +		return NULL;
> +	}
>   
>   	ring->base.size = sz;
>   	ring->base.effective_size = sz;
> @@ -209,7 +217,11 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   	engine->base.emit_breadcrumb = mock_emit_breadcrumb;
>   	engine->base.submit_request = mock_submit_request;
>   
> -	i915_timeline_init(i915, &engine->base.timeline, engine->base.name);
> +	if (i915_timeline_init(i915,
> +			       &engine->base.timeline,
> +			       engine->base.name,
> +			       NULL))
> +		goto err_free;
>   	i915_timeline_set_subclass(&engine->base.timeline, TIMELINE_ENGINE);
>   
>   	intel_engine_init_breadcrumbs(&engine->base);
> @@ -227,6 +239,7 @@ struct intel_engine_cs *mock_engine(struct drm_i915_private *i915,
>   err_breadcrumbs:
>   	intel_engine_fini_breadcrumbs(&engine->base);
>   	i915_timeline_fini(&engine->base.timeline);
> +err_free:
>   	kfree(engine);
>   	return NULL;
>   }
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer
  2019-01-18 14:00 ` [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer Chris Wilson
@ 2019-01-22  9:20   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-22  9:20 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> As we no longer have a precise indication of requests queued to an
> engine, make no presumptions and just sample the ring registers to see
> if the engine is busy.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_pmu.c | 47 +++++++++++----------------------
>   1 file changed, 16 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c
> index b1cb2d3cae16..452589a7e473 100644
> --- a/drivers/gpu/drm/i915/i915_pmu.c
> +++ b/drivers/gpu/drm/i915/i915_pmu.c
> @@ -148,14 +148,6 @@ void i915_pmu_gt_unparked(struct drm_i915_private *i915)
>   	spin_unlock_irq(&i915->pmu.lock);
>   }
>   
> -static bool grab_forcewake(struct drm_i915_private *i915, bool fw)
> -{
> -	if (!fw)
> -		intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
> -
> -	return true;
> -}
> -
>   static void
>   add_sample(struct i915_pmu_sample *sample, u32 val)
>   {
> @@ -168,7 +160,6 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
>   	struct intel_engine_cs *engine;
>   	enum intel_engine_id id;
>   	intel_wakeref_t wakeref;
> -	bool fw = false;
>   
>   	if ((dev_priv->pmu.enable & ENGINE_SAMPLE_MASK) == 0)
>   		return;
> @@ -180,37 +171,31 @@ engines_sample(struct drm_i915_private *dev_priv, unsigned int period_ns)
>   	if (!wakeref)
>   		return;
>   
> +	intel_uncore_forcewake_get(dev_priv, FORCEWAKE_ALL);
>   	for_each_engine(engine, dev_priv, id) {
> -		u32 current_seqno = intel_engine_get_seqno(engine);
> -		u32 last_seqno = intel_engine_last_submit(engine);
> +		typeof(engine->pmu) *pmu = &engine->pmu;

Or name the struct?

>   		u32 val;
>   
> -		val = !i915_seqno_passed(current_seqno, last_seqno);
> -
> -		if (val)
> -			add_sample(&engine->pmu.sample[I915_SAMPLE_BUSY],
> -				   period_ns);
> +		val = I915_READ_FW(RING_MI_MODE(engine->mmio_base));
> +		if (val & MODE_IDLE)
> +			continue;
>   
> -		if (val && (engine->pmu.enable &
> -		    (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA)))) {
> -			fw = grab_forcewake(dev_priv, fw);
> +		add_sample(&pmu->sample[I915_SAMPLE_BUSY], period_ns);
>   
> +		if (pmu->enable &
> +		    (BIT(I915_SAMPLE_WAIT) | BIT(I915_SAMPLE_SEMA))) {
>   			val = I915_READ_FW(RING_CTL(engine->mmio_base));
> -		} else {
> -			val = 0;
> -		}
>   
> -		if (val & RING_WAIT)
> -			add_sample(&engine->pmu.sample[I915_SAMPLE_WAIT],
> -				   period_ns);
> +			if (val & RING_WAIT)
> +				add_sample(&pmu->sample[I915_SAMPLE_WAIT],
> +					   period_ns);
>   
> -		if (val & RING_WAIT_SEMAPHORE)
> -			add_sample(&engine->pmu.sample[I915_SAMPLE_SEMA],
> -				   period_ns);
> +			if (val & RING_WAIT_SEMAPHORE)
> +				add_sample(&pmu->sample[I915_SAMPLE_SEMA],
> +					   period_ns);
> +		}
>   	}
> -
> -	if (fw)
> -		intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
> +	intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL);
>   
>   	intel_runtime_pm_put(dev_priv, wakeref);
>   }
> 

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method
  2019-01-18 14:00 ` [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method Chris Wilson
@ 2019-01-22  9:31   ` Tvrtko Ursulin
  2019-01-22 10:47     ` Chris Wilson
  0 siblings, 1 reply; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-22  9:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> An idea for extending uABI inspired by Vulkan's extension chains.
> Instead of expanding the data struct for each ioctl every time we need
> to add a new feature, define an extension chain instead. As we add
> optional interfaces to control the ioctl, we define a new extension
> struct that can be linked into the ioctl data only when required by the
> user. The key advantage being able to ignore large control structs for
> optional interfaces/extensions, while being able to process them in a
> consistent manner.
> 
> In comparison to other extensible ioctls, the key difference is the
> use of a linked chain of extension structs vs an array of tagged
> pointers. For example,
> 
> struct drm_amdgpu_cs_chunk {
>          __u32           chunk_id;
>          __u32           length_dw;
>          __u64           chunk_data;
> };
> 
> struct drm_amdgpu_cs_in {
>          __u32           ctx_id;
>          __u32           bo_list_handle;
>          __u32           num_chunks;
>          __u32           _pad;
>          __u64           chunks;
> };
> 
> allows userspace to pass in array of pointers to extension structs, but
> must therefore keep constructing that array along side the command stream.
> In dynamic situations like that, a linked list is preferred and does not
> similar from extra cache line misses as the extension structs themselves

s/similar/suffer/ I think.

> must still be loaded separate to the chunks array.
> 
> v2: Apply the tail call optimisation directly to nip the worry of stack
> overflow in the bud.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile               |  1 +
>   drivers/gpu/drm/i915/i915_user_extensions.c | 42 +++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_user_extensions.h | 20 ++++++++++
>   include/uapi/drm/i915_drm.h                 | 20 ++++++++++
>   4 files changed, 83 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/i915_user_extensions.c
>   create mode 100644 drivers/gpu/drm/i915/i915_user_extensions.h
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 611115ed00db..f206fbc85cee 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -45,6 +45,7 @@ i915-y := i915_drv.o \
>   	  i915_sw_fence.o \
>   	  i915_syncmap.o \
>   	  i915_sysfs.o \
> +	  i915_user_extensions.o \
>   	  intel_csr.o \
>   	  intel_device_info.o \
>   	  intel_pm.o \
> diff --git a/drivers/gpu/drm/i915/i915_user_extensions.c b/drivers/gpu/drm/i915/i915_user_extensions.c
> new file mode 100644
> index 000000000000..5d90c368f185
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_user_extensions.c
> @@ -0,0 +1,42 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#include <linux/sched/signal.h>
> +#include <linux/uaccess.h>
> +#include <uapi/drm/i915_drm.h>
> +
> +#include "i915_user_extensions.h"
> +
> +int i915_user_extensions(struct i915_user_extension __user *ext,
> +			 const i915_user_extension_fn *tbl,
> +			 unsigned long count,

I would be tempted to limit the count to unsigned int. 4 billion 
extensions should be enough for everyone. :)

The ABI can remain u64, but I don't think the implementation in this form 
needs it.

> +			 void *data)
> +{
> +	while (ext) {
> +		int err;
> +		u64 x;
> +
> +		cond_resched();
> +		if (signal_pending(current))
> +			return -EINTR;

What was your thinking behind this? It feels like, since here we are not 
doing any explicit wait/sleeps, that at this level the code shouldn't 
bother with it.

> +
> +		if (get_user(x, &ext->name))
> +			return -EFAULT;
> +
> +		err = -EINVAL;
> +		if (x < count && tbl[x])
> +			err = tbl[x](ext, data);
> +		if (err)
> +			return err;

We talked about providing unwind on failure, i.e. an option for destructor 
callbacks. You gave up on that?

> +
> +		if (get_user(x, &ext->next_extension))
> +			return -EFAULT;
> +
> +		ext = u64_to_user_ptr(x);
> +	}
> +
> +	return 0;
> +}
> diff --git a/drivers/gpu/drm/i915/i915_user_extensions.h b/drivers/gpu/drm/i915/i915_user_extensions.h
> new file mode 100644
> index 000000000000..313a510b068a
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/i915_user_extensions.h
> @@ -0,0 +1,20 @@
> +/*
> + * SPDX-License-Identifier: MIT
> + *
> + * Copyright © 2018 Intel Corporation
> + */
> +
> +#ifndef I915_USER_EXTENSIONS_H
> +#define I915_USER_EXTENSIONS_H
> +
> +struct i915_user_extension;
> +
> +typedef int (*i915_user_extension_fn)(struct i915_user_extension __user *ext,
> +				      void *data);
> +
> +int i915_user_extensions(struct i915_user_extension __user *ext,
> +			 const i915_user_extension_fn *tbl,
> +			 unsigned long count,
> +			 void *data);
> +
> +#endif /* I915_USER_EXTENSIONS_H */
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 298b2e197744..6ee2221838da 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -62,6 +62,26 @@ extern "C" {
>   #define I915_ERROR_UEVENT		"ERROR"
>   #define I915_RESET_UEVENT		"RESET"
>   
> +/*
> + * i915_user_extension: Base class for defining a chain of extensions
> + *
> + * Many interfaces need to grow over time. In most cases we can simply
> + * extend the struct and have userspace pass in more data. Another option,
> + * as demonstrated by Vulkan's approach to providing extensions for forward
> + * and backward compatibility, is to use a list of optional structs to
> + * provide those extra details.
> + *
> + * The key advantage to using an extension chain is that it allows us to
> + * redefine the interface more easily than an ever growing struct of
> + * increasing complexity, and for large parts of that interface to be
> + * entirely optional. The downside is more pointer chasing; chasing across
> + * the __user boundary with pointers encapsulated inside u64.
> + */
> +struct i915_user_extension {
> +	__u64 next_extension;
> +	__u64 name;

s/name/id/ ?

> +};
> +
>   /*
>    * MOCS indexes used for GPU surfaces, defining the cacheability of the
>    * surface data and the coherency for this data wrt. CPU vs. GPU accesses.
> 

Regards,

Tvrtko

* Re: [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method
  2019-01-22  9:31   ` Tvrtko Ursulin
@ 2019-01-22 10:47     ` Chris Wilson
  2019-01-22 11:05       ` Tvrtko Ursulin
  0 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-22 10:47 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-22 09:31:31)
> 
> On 18/01/2019 14:00, Chris Wilson wrote:
> > +/*
> > + * SPDX-License-Identifier: MIT
> > + *
> > + * Copyright © 2018 Intel Corporation
> > + */
> > +
> > +#include <linux/sched/signal.h>
> > +#include <linux/uaccess.h>
> > +#include <uapi/drm/i915_drm.h>
> > +
> > +#include "i915_user_extensions.h"
> > +
> > +int i915_user_extensions(struct i915_user_extension __user *ext,
> > +                      const i915_user_extension_fn *tbl,
> > +                      unsigned long count,
> 
> I would be tempted to limit the count to unsigned int. 4 billion 
> extensions should be enough for everyone. :)

I just picked the natural type, thinking having it the same width as
ARRAY_SIZE might save a few questions from semantic analysers.

> > +{
> > +     while (ext) {
> > +             int err;
> > +             u64 x;
> > +
> > +             cond_resched();
> > +             if (signal_pending(current))
> > +                     return -EINTR;
> 
> What was your thinking behind this? It feels like, since here we are not 
> doing any explicit wait/sleeps, that at this level the code shouldn't 
> bother with it.

This ties into the discussion vvv

> > +             if (get_user(x, &ext->name))
> > +                     return -EFAULT;
> > +
> > +             err = -EINVAL;
> > +             if (x < count && tbl[x])
> > +                     err = tbl[x](ext, data);
> > +             if (err)
> > +                     return err;
> 
> We talked about providing unwind on failure, i.e. an option for destructor 
> callbacks. You gave up on that?

The patch is simply called 'merde'. Yes, unwind on failure does not lend
itself to a simple tail call implementation :) And it doesn't lend
itself nicely to writing the stacked cleanup handlers either. (So I
stuck with the solution of just doing a single cleanup on failure, safe
in the knowledge that the most complicated case in this series is
trivial.)

Thinking about the issues with providing a stack for unwind, leads to
the nasty question of how big a stack exactly do we want to provide.
Limiting the chain is required for defense against misuse, but what
depth? For the chained parameter setup of a single shot context create,
we could easily be into the dozens of parameters and extensions blocks.
The extreme I've been contemplating is a multi-batch execbuf setup (with
all the fancy extensions for e.g. semi-privileged fancy register setup),
for that I've been thinking about how this interface would extend to
supporting many chained chunks. What makes a good upper bound for stack
depth? 32? 64? 512? Pick a number, it won't be enough for someone. (Now,
really passing that much information through an ioctl means our design
is broken ;)

So... The break on interrupt there was for the silly protection against
recursion, if it doesn't result in an invalid command.

Another reason the patch was called merde.

I think the chained API extension is very powerful. Being able to do
arbitrary things like a single-shot context creation ioctl and still be
able to redefine/extend the interface as needs demands, is compelling.
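
For illustration, the userspace side of such a chain would look roughly like
this (the extension structs and names below are entirely made up; only
struct i915_user_extension is from the patch):

	/* two hypothetical extension blocks for a single-shot ioctl */
	struct ext_foo {
		struct i915_user_extension base;
		__u64 value;
	} foo = {
		.base.name = I915_EXT_FOO,	/* hypothetical name */
		.value = 42,
	};

	struct ext_bar {
		struct i915_user_extension base;
		__u32 flags;
		__u32 pad;
	} bar = {
		.base.name = I915_EXT_BAR,	/* hypothetical name */
		.base.next_extension = (__u64)(uintptr_t)&foo,
		.flags = 1,
	};

	/*
	 * The ioctl argument then carries (__u64)(uintptr_t)&bar as the head
	 * of the chain; the kernel walks next_extension and dispatches each
	 * block by its name via the extension table.
	 */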

> > +/*
> > + * i915_user_extension: Base class for defining a chain of extensions
> > + *
> > + * Many interfaces need to grow over time. In most cases we can simply
> > + * extend the struct and have userspace pass in more data. Another option,
> > + * as demonstrated by Vulkan's approach to providing extensions for forward
> > + * and backward compatibility, is to use a list of optional structs to
> > + * provide those extra details.
> > + *
> > + * The key advantage to using an extension chain is that it allows us to
> > + * redefine the interface more easily than an ever growing struct of
> > + * increasing complexity, and for large parts of that interface to be
> > + * entirely optional. The downside is more pointer chasing; chasing across
> > + * the __user boundary with pointers encapsulated inside u64.
> > + */
> > +struct i915_user_extension {
> > +     __u64 next_extension;
> > +     __u64 name;
> 
> s/name/id/ ?

I think name is common parlance for extensions/parameters? At least I've
been using it like it was :) And I was trying to retrospectively restrict
'id' for handles tracked by an idr! :)
-Chris

* Re: [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method
  2019-01-22 10:47     ` Chris Wilson
@ 2019-01-22 11:05       ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-22 11:05 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 22/01/2019 10:47, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-22 09:31:31)
>>
>> On 18/01/2019 14:00, Chris Wilson wrote:
>>> +/*
>>> + * SPDX-License-Identifier: MIT
>>> + *
>>> + * Copyright © 2018 Intel Corporation
>>> + */
>>> +
>>> +#include <linux/sched/signal.h>
>>> +#include <linux/uaccess.h>
>>> +#include <uapi/drm/i915_drm.h>
>>> +
>>> +#include "i915_user_extensions.h"
>>> +
>>> +int i915_user_extensions(struct i915_user_extension __user *ext,
>>> +                      const i915_user_extension_fn *tbl,
>>> +                      unsigned long count,
>>
>> I would be tempted to limit the count to unsigned int. 4 billion
>> extensions should be enough for everyone. :)
> 
> I just picked the natural type, thinking having it the same width as
> ARRAY_SIZE might save a few questions from semantic analysers.
> 
>>> +{
>>> +     while (ext) {
>>> +             int err;
>>> +             u64 x;
>>> +
>>> +             cond_resched();
>>> +             if (signal_pending(current))
>>> +                     return -EINTR;
>>
>> What was your thinking behind this? It feels like, since here we are not
>> doing any explicit wait/sleeps, that at this level the code shouldn't
>> bother with it.
> 
> This ties into the discussion vvv
> 
>>> +             if (get_user(x, &ext->name))
>>> +                     return -EFAULT;
>>> +
>>> +             err = -EINVAL;
>>> +             if (x < count && tbl[x])
>>> +                     err = tbl[x](ext, data);
>>> +             if (err)
>>> +                     return err;
>>
>> We talked about providing unwind on failure ie. option for destructor
>> call backs. You gave up on that?
> 
> The patch is simply called 'merde'. Yes, unwind on failure does not lend
> itself to a simple tail call implementation :) And it doesn't lend
> itself nicely to writing the stacked cleanup handlers either. (So I
> stuck with the solution of just doing a single cleanup on failure, safe
> in the knowledge that the most complicated case in this series is
> trivial.)
> 
> Thinking about the issues with providing a stack for unwind, leads to
> the nasty question of how big a stack exactly do we want to provide.
> Limiting the chain is required for defense against misuse, but what
> depth? For the chained parameter setup of a single shot context create,
> we could easily be into the dozens of parameters and extensions blocks.
> The extreme I've been contemplating is a multi-batch execbuf setup (with
> all the fancy extensions for e.g. semi-privileged fancy register setup),
> for that I've been thinking about how this interface would extend to
> supporting many chained chunks. What makes a good upper bound for stack
> depth? 32? 64? 512? Pick a number, it won't be enough for someone. (Now,
> really passing that much information through an ioctl means our design
> is broken ;)
> 
> So... The break on interrupt there was for the silly protection against
> recursion, if it doesn't result in an invalid command.
> 
> Another reason the patch was called merde.
> 
> I think the chained API extension is very powerful. Being able to do
> arbitrary things like a single-shot context creation ioctl and still be
> able to redefine/extend the interface as needs demands, is compelling.

I did not think about unwind implementation details. :)

We could make the ABI a doubly linked list? But that feels bad..

Or have some reasonable stack size and, if it overflows, fall back to a very 
inefficient walk backwards from the root. That sounds okay for this use 
case since we would never overflow even a small stack, and even the 
inefficient walk on unwind would be a) fast and b) rare.
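
Very roughly, the fallback half of that could look like the below (untested
sketch; unwind_extensions and undo_tbl are made up, the latter being a table
of destructor callbacks mirroring the constructor table). It releases
everything applied before the failing extension in reverse order, without
keeping any state, by rescanning the chain from the root each time:

	static void unwind_extensions(struct i915_user_extension __user *root,
				      struct i915_user_extension __user *failed,
				      const i915_user_extension_fn *undo_tbl,
				      unsigned long count,
				      void *data)
	{
		while (failed != root) {
			struct i915_user_extension __user *ext = root;
			u64 x;

			/* find the predecessor of the last extension we undid */
			while (ext) {
				if (get_user(x, &ext->next_extension))
					return;
				if (u64_to_user_ptr(x) == failed)
					break;
				ext = u64_to_user_ptr(x);
			}
			if (!ext)
				return; /* chain changed under us, give up */

			if (!get_user(x, &ext->name) && x < count && undo_tbl[x])
				undo_tbl[x](ext, data);

			failed = ext;
		}
	}

Quadratic in the chain length, but as you say it would only ever run on an
error path with short chains.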

But since you mention stack sizes like 512, I wonder whether I did not quite 
grasp how much you'd want to use extensions in the future. :)

>>> +/*
>>> + * i915_user_extension: Base class for defining a chain of extensions
>>> + *
>>> + * Many interfaces need to grow over time. In most cases we can simply
>>> + * extend the struct and have userspace pass in more data. Another option,
>>> + * as demonstrated by Vulkan's approach to providing extensions for forward
>>> + * and backward compatibility, is to use a list of optional structs to
>>> + * provide those extra details.
>>> + *
>>> + * The key advantage to using an extension chain is that it allows us to
>>> + * redefine the interface more easily than an ever growing struct of
>>> + * increasing complexity, and for large parts of that interface to be
>>> + * entirely optional. The downside is more pointer chasing; chasing across
>>> + * the __user boundary with pointers encapsulated inside u64.
>>> + */
>>> +struct i915_user_extension {
>>> +     __u64 next_extension;
>>> +     __u64 name;
>>
>> s/name/id/ ?
> 
> I think name is common parlance for extensions/parameters? At least I've
> been using it like it was :) And I was trying to retrospectively restrict
> 'id' for handles tracked by an idr! :)

I don't know if it is common, it is completely new to me. Maybe it is.. 
Or how about type?

Regards,

Tvrtko

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-18 16:03   ` Tvrtko Ursulin
  2019-01-18 16:06     ` Chris Wilson
@ 2019-01-22 14:19     ` Chris Wilson
  2019-01-25 10:46       ` Tvrtko Ursulin
  1 sibling, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-22 14:19 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
> 
> On 18/01/2019 14:00, Chris Wilson wrote:
> > Our goal is to remove struct_mutex and replace it with fine grained
> > locking. One of the thorny issues is our eviction logic for reclaiming
> > space for an execbuffer (or GTT mmaping, among a few other examples).
> > While eviction itself is easy to move under a per-VM mutex, performing
> > the activity tracking is less agreeable. One solution is not to do any
> > MRU tracking and do a simple coarse evaluation during eviction of
> > active/inactive, with a loose temporal ordering of last
> > insertion/evaluation. That keeps all the locking constrained to when we
> > are manipulating the VM itself, neatly avoiding the tricky handling of
> > possible recursive locking during execbuf and elsewhere.
> > 
> > Note that discarding the MRU is unlikely to impact upon our efficiency
> > to reclaim VM space (where we think a LRU model is best) as our
> > current strategy is to use random idle replacement first before doing
> > a search, and over time the use of softpinned 48b per-ppGTT is growing
> > (thereby eliminating any need to perform any eviction searches, in
> > theory at least).
> 
> There is still no mention of list consolidation.

"Note that discarding the MRU (currently implemented as a pair of lists,
to avoid scanning the active list for a NONBLOCKING search)"

Is that enough to make it clear?
-Chris

* Re: [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts
  2019-01-18 14:00 ` [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts Chris Wilson
@ 2019-01-23 11:30   ` Tvrtko Ursulin
  2019-01-23 11:51     ` Chris Wilson
  2019-01-24 15:58     ` [PATCH v3] " Chris Wilson
  0 siblings, 2 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-23 11:30 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:00, Chris Wilson wrote:
> In preparation for making the ppGTT binding for a context explicit (to
> facilitate reusing the same ppGTT between different contexts), allow the
> user to create and destroy named ppGTTs.

Needs some more text to explain uAPI usage, or even better kerneldoc in 
i915_drm.h.
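
Even a short sketch of the intended flow would do; something like this
is what I have in mind (rough illustration only, struct and ioctl names
as defined further down in this patch):

	struct drm_i915_gem_vm_create args = {};

	/* create a new ppGTT for this fd and get an id back for it */
	ioctl(fd, DRM_IOCTL_I915_GEM_VM_CREATE, &args);

	/* ... use args.id to tie the vm to contexts ... */

	/* and drop it again; destroy just takes the u32 id */
	ioctl(fd, DRM_IOCTL_I915_GEM_VM_DESTROY, &args.id);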

> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_drv.c         |  2 +
>   drivers/gpu/drm/i915/i915_drv.h         |  3 +
>   drivers/gpu/drm/i915/i915_gem_context.c | 83 +++++++++++++++++++++++++
>   drivers/gpu/drm/i915/i915_gem_context.h |  5 ++
>   drivers/gpu/drm/i915/i915_gem_gtt.h     |  2 +
>   include/uapi/drm/i915_drm.h             | 10 +++
>   6 files changed, 105 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index f462a4d28af4..56bd087f1c4e 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -3003,6 +3003,8 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
>   	DRM_IOCTL_DEF_DRV(I915_PERF_ADD_CONFIG, i915_perf_add_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
>   	DRM_IOCTL_DEF_DRV(I915_PERF_REMOVE_CONFIG, i915_perf_remove_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
>   	DRM_IOCTL_DEF_DRV(I915_QUERY, i915_query_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(I915_GEM_VM_CREATE, i915_gem_vm_create_ioctl, DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(I915_GEM_VM_DESTROY, i915_gem_vm_destroy_ioctl, DRM_RENDER_ALLOW),
>   };
>   
>   static struct drm_driver driver = {
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 74bccb153359..cb11c23823c7 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -217,6 +217,9 @@ struct drm_i915_file_private {
>   	} mm;
>   	struct idr context_idr;
>   
> +	struct mutex vm_lock;
> +	struct idr vm_idr;
> +
>   	struct intel_rps_client {
>   		atomic_t boosts;
>   	} rps_client;
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index e8334c4bc130..7c90981704bf 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -598,19 +598,29 @@ static int context_idr_cleanup(int id, void *p, void *data)
>   	return 0;
>   }
>   
> +static int vm_idr_cleanup(int id, void *p, void *data)
> +{
> +	i915_ppgtt_put(p);
> +	return 0;
> +}
> +
>   int i915_gem_context_open(struct drm_i915_private *i915,
>   			  struct drm_file *file)
>   {
>   	struct drm_i915_file_private *file_priv = file->driver_priv;
>   	struct i915_gem_context *ctx;
>   
> +	mutex_init(&file_priv->vm_lock);
> +
>   	idr_init(&file_priv->context_idr);
> +	idr_init_base(&file_priv->vm_idr, 1);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
>   	ctx = i915_gem_create_context(i915, file_priv);
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	if (IS_ERR(ctx)) {
>   		idr_destroy(&file_priv->context_idr);
> +		idr_destroy(&file_priv->vm_idr);
>   		return PTR_ERR(ctx);
>   	}
>   
> @@ -627,6 +637,79 @@ void i915_gem_context_close(struct drm_file *file)
>   
>   	idr_for_each(&file_priv->context_idr, context_idr_cleanup, NULL);
>   	idr_destroy(&file_priv->context_idr);
> +
> +	idr_for_each(&file_priv->vm_idr, vm_idr_cleanup, NULL);
> +	idr_destroy(&file_priv->vm_idr);
> +
> +	mutex_destroy(&file_priv->vm_lock);
> +}
> +
> +int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file)
> +{
> +	struct drm_i915_private *i915 = to_i915(dev);
> +	struct drm_i915_gem_vm_create *args = data;
> +	struct drm_i915_file_private *file_priv = file->driver_priv;
> +	struct i915_hw_ppgtt *ppgtt;
> +	int err;
> +
> +	if (!HAS_FULL_PPGTT(i915))
> +		return -ENODEV;
> +
> +	if (args->flags)
> +		return -EINVAL;
> +
> +	if (args->extensions)
> +		return -EINVAL;
> +
> +	ppgtt = i915_ppgtt_create(i915, file_priv);
> +	if (IS_ERR(ppgtt))
> +		return PTR_ERR(ppgtt);
> +
> +	err = mutex_lock_interruptible(&file_priv->vm_lock);
> +	if (err)
> +		goto err_put;
> +
> +	err = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
> +	mutex_unlock(&file_priv->vm_lock);
> +	if (err < 0)
> +		goto err_put;

	else if (GEM_WARN_ON(err == 0) {
		err = -EINVAL;
		goto err_put;
	}

?

Or a GEM_BUG_ON(err == 0) if you must. :)

> +
> +	ppgtt->user_handle = err;
> +	args->id = err;
> +	return 0;
> +
> +err_put:
> +	i915_ppgtt_put(ppgtt);
> +	return err;
> +}
> +
> +int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,

Can we cheat and declare data as u32 * here, and so avoid the need for
the local, since there are only two dereferences?

> +			      struct drm_file *file)
> +{
> +	struct drm_i915_file_private *file_priv = file->driver_priv;
> +	struct i915_hw_ppgtt *ppgtt;
> +	int err;
> +	u32 id;
> +
> +	id = *(u32 *)data;
> +	if (!id)
> +		return -ENOENT;
> +
> +	err = mutex_lock_interruptible(&file_priv->vm_lock);
> +	if (err)
> +		return err;
> +
> +	ppgtt = idr_remove(&file_priv->vm_idr, id);
> +	if (ppgtt)

GEM_WARN_ON(id != ppgtt->user_handle) too much paranoia?

> +		ppgtt->user_handle = 0;
> +
> +	mutex_unlock(&file_priv->vm_lock);
> +	if (!ppgtt)
> +		return -ENOENT;
> +
> +	i915_ppgtt_put(ppgtt);
> +	return 0;

Or end with simply:

i915_ppgtt_put(ppgtt);

return ppgtt ? 0 : -ENOENT;

?

>   }
>   
>   static struct i915_request *
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 1e41a97b8007..9786f86b659d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -348,6 +348,11 @@ void i915_gem_context_release(struct kref *ctx_ref);
>   struct i915_gem_context *
>   i915_gem_context_create_gvt(struct drm_device *dev);
>   
> +int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
> +			     struct drm_file *file);
> +int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file);
> +
>   int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>   				  struct drm_file *file);
>   int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 03ade71b8d9a..61698609a62c 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -396,6 +396,8 @@ struct i915_hw_ppgtt {
>   		struct i915_page_directory_pointer pdp;	/* GEN8+ */
>   		struct i915_page_directory pd;		/* GEN6-7 */
>   	};
> +
> +	u32 user_handle;
>   };
>   
>   struct gen6_hw_ppgtt {
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 6ee2221838da..c3336c931a95 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -339,6 +339,8 @@ typedef struct _drm_i915_sarea {
>   #define DRM_I915_PERF_ADD_CONFIG	0x37
>   #define DRM_I915_PERF_REMOVE_CONFIG	0x38
>   #define DRM_I915_QUERY			0x39
> +#define DRM_I915_GEM_VM_CREATE		0x3a
> +#define DRM_I915_GEM_VM_DESTROY		0x3b
>   
>   #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
>   #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
> @@ -397,6 +399,8 @@ typedef struct _drm_i915_sarea {
>   #define DRM_IOCTL_I915_PERF_ADD_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
>   #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
>   #define DRM_IOCTL_I915_QUERY			DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
> +#define DRM_IOCTL_I915_GEM_VM_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_create)
> +#define DRM_IOCTL_I915_GEM_VM_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, uint32_t)
>   
>   /* Allow drivers to submit batchbuffers directly to hardware, relying
>    * on the security mechanisms provided by hardware.
> @@ -1442,6 +1446,12 @@ struct drm_i915_gem_context_destroy {
>   	__u32 pad;
>   };

Some kernel doc overview of create and destroy usage, especially since 
destroy does not have its own data structure.

> +struct drm_i915_gem_vm_create {
> +	__u64 extensions;
> +	__u32 flags;
> +	__u32 id; /* output: id of new vm */
> +};
> +
>   struct drm_i915_reg_read {
>   	/*
>   	 * Register offset.
> 

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts
  2019-01-23 11:30   ` Tvrtko Ursulin
@ 2019-01-23 11:51     ` Chris Wilson
  2019-01-23 12:03       ` Tvrtko Ursulin
  2019-01-24 15:58     ` [PATCH v3] " Chris Wilson
  1 sibling, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-23 11:51 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-23 11:30:39)
> > +int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
> > +                          struct drm_file *file)
> > +{
> > +     struct drm_i915_private *i915 = to_i915(dev);
> > +     struct drm_i915_gem_vm_create *args = data;
> > +     struct drm_i915_file_private *file_priv = file->driver_priv;
> > +     struct i915_hw_ppgtt *ppgtt;
> > +     int err;
> > +
> > +     if (!HAS_FULL_PPGTT(i915))
> > +             return -ENODEV;
> > +
> > +     if (args->flags)
> > +             return -EINVAL;
> > +
> > +     if (args->extensions)
> > +             return -EINVAL;
> > +
> > +     ppgtt = i915_ppgtt_create(i915, file_priv);
> > +     if (IS_ERR(ppgtt))
> > +             return PTR_ERR(ppgtt);
> > +
> > +     err = mutex_lock_interruptible(&file_priv->vm_lock);
> > +     if (err)
> > +             goto err_put;
> > +
> > +     err = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
> > +     mutex_unlock(&file_priv->vm_lock);
> > +     if (err < 0)
> > +             goto err_put;
> 
>         else if (GEM_WARN_ON(err == 0) {
>                 err = -EINVAL;
>                 goto err_put;
>         }
> 
> ?
> 
> Or a GEM_BUG_ON(err == 0) if you must. :)

Not our bug :) We told it never to return 0.
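
(That is, the idr is initialised earlier in the patch with

	idr_init_base(&file_priv->vm_idr, 1);

so idr_alloc() only hands out ids from 1 upwards, keeping 0 free to mean
default/unassigned.)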

> > +
> > +     ppgtt->user_handle = err;
> > +     args->id = err;
> > +     return 0;
> > +
> > +err_put:
> > +     i915_ppgtt_put(ppgtt);
> > +     return err;
> > +}
> > +
> > +int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
> 
> Can we cheat and declare data as u32 * here and so avoid need for the 
> local, since there are only two dereferences?

Tempting, we need to cook up the macros to hide the function pointer
differences for the ioctl tables.

> > +                           struct drm_file *file)
> > +{
> > +     struct drm_i915_file_private *file_priv = file->driver_priv;
> > +     struct i915_hw_ppgtt *ppgtt;
> > +     int err;
> > +     u32 id;
> > +
> > +     id = *(u32 *)data;

Oh crikey, did I write this?

Remember, do it right the first time as I won't remember when I was
cutting corners.

> > +     if (!id)
> > +             return -ENOENT;
> > +
> > +     err = mutex_lock_interruptible(&file_priv->vm_lock);
> > +     if (err)
> > +             return err;
> > +
> > +     ppgtt = idr_remove(&file_priv->vm_idr, id);
> > +     if (ppgtt)
> 
> GEM_WARN_ON(id != ppgtt->user_handle) too much paranoia?

BUG_ON then!

> > +             ppgtt->user_handle = 0;
> > +
> > +     mutex_unlock(&file_priv->vm_lock);
> > +     if (!ppgtt)
> > +             return -ENOENT;
> > +
> > +     i915_ppgtt_put(ppgtt);
> > +     return 0;
> 
> Or end with simply:
> 
> i915_ppgtt_put(ppgtt);
> 
> return ppgtt ? 0 : -ENOENT;
> 
> ?

I feel a slight disappointment at the anticlimactic nature of this
function, leaving a gap, nay, a yearning, for more.
-Chris

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context
  2019-01-18 14:01 ` [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context Chris Wilson
@ 2019-01-23 12:00   ` Tvrtko Ursulin
  2019-01-23 12:15     ` Chris Wilson
  0 siblings, 1 reply; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-23 12:00 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:01, Chris Wilson wrote:
> Allow the user to share ppGTT between contexts on the same fd. This
> gives the user the ability to relax their context isolation to share vm
> between their own contexts, but does not allow them to import a vm from
> another fd. The use case for sharing a vm is for the proposed virtual
> engine work where a context may be created explicitly to setup a load
> balancing engine, but always be run in conjunction with a second context
> for rcs/compute etc. Giving the user control over how they set up the
> vm allows them to have a single vm shared between all kernel contexts
> being used to emulate a single client context, similarly to how we use
> a single vm across all engines within a single kernel context today.
> It also allows for the future specification of a separate vm between
> engines inside a single kernel context, should that be desired.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c       | 118 ++++++++-
>   drivers/gpu/drm/i915/i915_gem_gtt.c           |  17 +-
>   drivers/gpu/drm/i915/i915_gem_gtt.h           |  14 +-
>   drivers/gpu/drm/i915/selftests/huge_pages.c   |   1 -
>   .../gpu/drm/i915/selftests/i915_gem_context.c | 247 ++++++++++++++----
>   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   1 -
>   drivers/gpu/drm/i915/selftests/mock_context.c |   8 +-
>   include/uapi/drm/i915_drm.h                   |   1 +
>   8 files changed, 339 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 7c90981704bf..f707241dbc78 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -109,6 +109,8 @@ static void lut_close(struct i915_gem_context *ctx)
>   		struct i915_vma *vma = rcu_dereference_raw(*slot);
>   
>   		radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
> +
> +		vma->open_count--;

I did not figure out what this is: a) why open-coded vma management
without any comments, and b) the rest of the patch doesn't seem to touch
this tree.

>   		__i915_gem_object_release_unless_active(vma->obj);
>   	}
>   	rcu_read_unlock();
> @@ -291,7 +293,7 @@ static void context_close(struct i915_gem_context *ctx)
>   	 */
>   	lut_close(ctx);
>   	if (ctx->ppgtt)
> -		i915_ppgtt_close(&ctx->ppgtt->vm);
> +		i915_ppgtt_close(ctx->ppgtt);

I'll need to figure out if it is okay for context to close the ppgtt 
instead of just dropping references to it. Like two contexts sharing 
ppgtt and one closes it, the other one should continue to work fine, no? 
Or even a third context is created sharing the same ppgtt.

>   
>   	ctx->file_priv = ERR_PTR(-EBADF);
>   	i915_gem_context_put(ctx);
> @@ -401,6 +403,23 @@ static void __destroy_hw_context(struct i915_gem_context *ctx,
>   	context_close(ctx);
>   }
>   
> +static void __set_ppgtt(struct i915_gem_context *ctx,
> +			struct i915_hw_ppgtt *ppgtt)
> +{
> +	if (ppgtt == ctx->ppgtt)
> +		return;
> +
> +	if (ctx->ppgtt) {
> +		i915_ppgtt_close(ctx->ppgtt);

Feels incorrect to close it if it could be shared and in use elsewhere.

> +		i915_ppgtt_put(ctx->ppgtt);
> +	}
> +
> +	i915_ppgtt_open(ppgtt);

Do we need some protection against trying to re-open a closed ppgtt here?

> +	ctx->ppgtt = i915_ppgtt_get(ppgtt);
> +
> +	ctx->desc_template = default_desc_template(ctx->i915, ppgtt);
> +}
> +
>   static struct i915_gem_context *
>   i915_gem_create_context(struct drm_i915_private *dev_priv,
>   			struct drm_i915_file_private *file_priv)
> @@ -427,8 +446,8 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
>   			return ERR_CAST(ppgtt);
>   		}
>   
> -		ctx->ppgtt = ppgtt;
> -		ctx->desc_template = default_desc_template(dev_priv, ppgtt);
> +		__set_ppgtt(ctx, ppgtt);
> +		i915_ppgtt_put(ppgtt);
>   	}
>   
>   	trace_i915_context_create(ctx);
> @@ -784,6 +803,87 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
>   	return 0;
>   }
>   
> +static int get_ppgtt(struct i915_gem_context *ctx, u64 *out)

u32 *out ?

> +{
> +	struct drm_i915_file_private *file_priv = ctx->file_priv;
> +	struct i915_hw_ppgtt *ppgtt;
> +	int ret;
> +
> +	/* XXX rcu acquire? */
> +	ppgtt = ctx->ppgtt;
> +	if (!ppgtt)
> +		return -ENODEV;
> +
> +	ret = mutex_lock_interruptible(&file_priv->vm_lock);
> +	if (ret)
> +		return ret;
> +
> +	ret = ppgtt->user_handle;
> +	if (!ret) {
> +		ret = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);

GEM_WARN_ON(ret == 0) just in case?

> +		if (ret > 0) {
> +			ppgtt->user_handle = ret;
> +			i915_ppgtt_get(ppgtt);
> +		}
> +	}
> +
> +	mutex_unlock(&file_priv->vm_lock);
> +	if (ret < 0)
> +		return ret;
> +
> +	*out = ret;
> +	return 0;
> +}
> +
> +static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
> +{
> +	struct drm_i915_file_private *file_priv = ctx->file_priv;
> +	struct i915_hw_ppgtt *ppgtt;
> +	int err;
> +
> +	err = mutex_lock_interruptible(&file_priv->vm_lock);
> +	if (err)
> +		return err;
> +
> +	ppgtt = idr_find(&file_priv->vm_idr, id);
> +	if (ppgtt)
> +		i915_ppgtt_get(ppgtt);
> +	mutex_unlock(&file_priv->vm_lock);
> +	if (!ppgtt)
> +		return -ENOENT;
> +
> +	err = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
> +	if (err)
> +		goto out;
> +
> +	/*
> +	 * We need to flush any requests using the current ppgtt before
> +	 * we release it as the requests do not hold a reference themselves,
> +	 * only indirectly through the context. By switching to the kernel
> +	 * context, we ensure that the TLBs are reloaded before using the
> +	 * same context again -- an extra layer of paranoia over wait_for_idle.
> +	 */
> +	err = i915_gem_switch_to_kernel_context(ctx->i915);
> +	if (err)
> +		goto out_unlock;
> +
> +	err = i915_gem_wait_for_idle(ctx->i915,
> +				     I915_WAIT_LOCKED |
> +				     I915_WAIT_INTERRUPTIBLE,
> +				     MAX_SCHEDULE_TIMEOUT);

This is a bit worrying. Every new client setting up their contexts
causes a global sync point, which sounds bad for scalability. In
practice the event might be happening only every few seconds, rather
than multiple times per second, but I am only guessing. How difficult
would it be to make requests own the ppgtt directly?

> +	if (err)
> +		goto out_unlock;
> +
> +	__set_ppgtt(ctx, ppgtt);
> +
> +out_unlock:
> +	mutex_unlock(&ctx->i915->drm.struct_mutex);
> +
> +out:
> +	i915_ppgtt_put(ppgtt);
> +	return err;
> +}
> +
>   static bool client_is_banned(struct drm_i915_file_private *file_priv)
>   {
>   	return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED;
> @@ -896,6 +996,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>   	case I915_CONTEXT_PARAM_PRIORITY:
>   		args->value = ctx->sched.priority >> I915_USER_PRIORITY_SHIFT;
>   		break;
> +	case I915_CONTEXT_PARAM_VM:
> +		ret = get_ppgtt(ctx, &args->value);
> +		break;
>   	default:
>   		ret = -EINVAL;
>   		break;
> @@ -968,6 +1071,15 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>   		}
>   		break;
>   
> +	case I915_CONTEXT_PARAM_VM:
> +		if (args->size)
> +			ret = -EINVAL;
> +		else if (upper_32_bits(args->value))
> +			ret = -EINVAL;
> +		else
> +			ret = set_ppgtt(ctx, lower_32_bits(args->value));
> +		break;
> +
>   	default:
>   		ret = -EINVAL;
>   		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
> index 49b00996a15e..27b40c14e745 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
> @@ -2104,10 +2104,21 @@ i915_ppgtt_create(struct drm_i915_private *i915,
>   	return ppgtt;
>   }
>   
> -void i915_ppgtt_close(struct i915_address_space *vm)
> +void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt)
>   {
> -	GEM_BUG_ON(vm->closed);
> -	vm->closed = true;
> +	GEM_BUG_ON(ppgtt->vm.closed);
> +
> +	ppgtt->open_count++;
> +}
> +
> +void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt)
> +{
> +	GEM_BUG_ON(!ppgtt->open_count);
> +	if (--ppgtt->open_count)
> +		return;
> +
> +	GEM_BUG_ON(ppgtt->vm.closed);
> +	ppgtt->vm.closed = true;
>   }
>   
>   static void ppgtt_destroy_vma(struct i915_address_space *vm)
> diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
> index 61698609a62c..bb750318f52a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_gtt.h
> +++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
> @@ -391,6 +391,8 @@ struct i915_hw_ppgtt {
>   	struct kref ref;
>   
>   	unsigned long pd_dirty_rings;
> +	unsigned int open_count;
> +
>   	union {
>   		struct i915_pml4 pml4;		/* GEN8+ & 48b PPGTT */
>   		struct i915_page_directory_pointer pdp;	/* GEN8+ */
> @@ -608,12 +610,16 @@ int i915_ppgtt_init_hw(struct drm_i915_private *dev_priv);
>   void i915_ppgtt_release(struct kref *kref);
>   struct i915_hw_ppgtt *i915_ppgtt_create(struct drm_i915_private *dev_priv,
>   					struct drm_i915_file_private *fpriv);
> -void i915_ppgtt_close(struct i915_address_space *vm);
> -static inline void i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
> +
> +void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt);
> +void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt);
> +
> +static inline struct i915_hw_ppgtt *i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
>   {
> -	if (ppgtt)
> -		kref_get(&ppgtt->ref);
> +	kref_get(&ppgtt->ref);
> +	return ppgtt;

Unrelated hunk?

>   }
> +
>   static inline void i915_ppgtt_put(struct i915_hw_ppgtt *ppgtt)
>   {
>   	if (ppgtt)
> diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c
> index a9a2fa35876f..a7ee8e97bcee 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_pages.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_pages.c
> @@ -1734,7 +1734,6 @@ int i915_gem_huge_page_mock_selftests(void)
>   	err = i915_subtests(tests, ppgtt);
>   
>   out_close:
> -	i915_ppgtt_close(&ppgtt->vm);
>   	i915_ppgtt_put(ppgtt);
>   
>   out_unlock:
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 6a241745e78a..2864cfb82325 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -243,6 +243,12 @@ static int live_nop_switch(void *arg)
>   	return err;
>   }
>   
> +#define GEN8_HIGH_ADDRESS_BIT 47
> +static inline u64 gen8_canonical_addr(u64 address)
> +{
> +	return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
> +}

We could move the copy from i915_gem_execbuffer.c to i915_gem_utils.h or 
somewhere.

> +
>   static struct i915_vma *
>   gpu_fill_dw(struct i915_vma *vma, u64 offset, unsigned long count, u32 value)
>   {
> @@ -266,6 +272,7 @@ gpu_fill_dw(struct i915_vma *vma, u64 offset, unsigned long count, u32 value)
>   
>   	GEM_BUG_ON(offset + (count - 1) * PAGE_SIZE > vma->node.size);
>   	offset += vma->node.start;
> +	offset = gen8_canonical_addr(offset);
>   
>   	for (n = 0; n < count; n++) {
>   		if (gen >= 8) {
> @@ -391,6 +398,7 @@ static int gpu_fill(struct drm_i915_gem_object *obj,
>   		goto skip_request;
>   
>   	i915_gem_object_set_active_reference(batch->obj);
> +
>   	i915_vma_unpin(batch);
>   	i915_vma_close(batch);
>   
> @@ -439,7 +447,8 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value)
>   	return 0;
>   }
>   
> -static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
> +static noinline int cpu_check(struct drm_i915_gem_object *obj,
> +			      unsigned int idx, unsigned int max)
>   {
>   	unsigned int n, m, needs_flush;
>   	int err;
> @@ -457,8 +466,10 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
>   
>   		for (m = 0; m < max; m++) {
>   			if (map[m] != m) {
> -				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
> -				       n, m, map[m], m);
> +				pr_err("%pS: Invalid value at object %d page %d/%ld, offset %d/%d: found %x expected %x\n",
> +				       __builtin_return_address(0), idx,
> +				       n, real_page_count(obj), m, max,
> +				       map[m], m);
>   				err = -EINVAL;
>   				goto out_unmap;
>   			}
> @@ -466,8 +477,9 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
>   
>   		for (; m < DW_PER_PAGE; m++) {
>   			if (map[m] != STACK_MAGIC) {
> -				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
> -				       n, m, map[m], STACK_MAGIC);
> +				pr_err("%pS: Invalid value at object %d page %d, offset %d: found %x expected %x (uninitialised)\n",
> +				       __builtin_return_address(0), idx, n, m,
> +				       map[m], STACK_MAGIC);
>   				err = -EINVAL;
>   				goto out_unmap;
>   			}
> @@ -545,12 +557,8 @@ static unsigned long max_dwords(struct drm_i915_gem_object *obj)
>   static int igt_ctx_exec(void *arg)
>   {
>   	struct drm_i915_private *i915 = arg;
> -	struct drm_i915_gem_object *obj = NULL;
> -	unsigned long ncontexts, ndwords, dw;
> -	struct drm_file *file;
> -	IGT_TIMEOUT(end_time);
> -	LIST_HEAD(objects);
> -	struct live_test t;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
>   	int err = -ENODEV;
>   
>   	/*
> @@ -562,38 +570,42 @@ static int igt_ctx_exec(void *arg)
>   	if (!DRIVER_CAPS(i915)->has_logical_contexts)
>   		return 0;
>   
> -	file = mock_file(i915);
> -	if (IS_ERR(file))
> -		return PTR_ERR(file);
> +	for_each_engine(engine, i915, id) {
> +		struct drm_i915_gem_object *obj = NULL;
> +		unsigned long ncontexts, ndwords, dw;
> +		struct drm_file *file;
> +		IGT_TIMEOUT(end_time);
> +		LIST_HEAD(objects);
> +		struct live_test t;
>   
> -	mutex_lock(&i915->drm.struct_mutex);
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
>   
> -	err = begin_live_test(&t, i915, __func__, "");
> -	if (err)
> -		goto out_unlock;
> +		if (!engine->context_size)
> +			continue; /* No logical context support in HW */
>   
> -	ncontexts = 0;
> -	ndwords = 0;
> -	dw = 0;
> -	while (!time_after(jiffies, end_time)) {
> -		struct intel_engine_cs *engine;
> -		struct i915_gem_context *ctx;
> -		unsigned int id;
> +		file = mock_file(i915);
> +		if (IS_ERR(file))
> +			return PTR_ERR(file);
>   
> -		ctx = i915_gem_create_context(i915, file->driver_priv);
> -		if (IS_ERR(ctx)) {
> -			err = PTR_ERR(ctx);
> +		mutex_lock(&i915->drm.struct_mutex);
> +
> +		err = begin_live_test(&t, i915, __func__, engine->name);
> +		if (err)
>   			goto out_unlock;
> -		}
>   
> -		for_each_engine(engine, i915, id) {
> +		ncontexts = 0;
> +		ndwords = 0;
> +		dw = 0;
> +		while (!time_after(jiffies, end_time)) {
> +			struct i915_gem_context *ctx;
>   			intel_wakeref_t wakeref;
>   
> -			if (!engine->context_size)
> -				continue; /* No logical context support in HW */
> -
> -			if (!intel_engine_can_store_dword(engine))
> -				continue;
> +			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			if (IS_ERR(ctx)) {
> +				err = PTR_ERR(ctx);
> +				goto out_unlock;
> +			}
>   
>   			if (!obj) {
>   				obj = create_test_object(ctx, file, &objects);
> @@ -603,7 +615,6 @@ static int igt_ctx_exec(void *arg)
>   				}
>   			}
>   
> -			err = 0;
>   			with_intel_runtime_pm(i915, wakeref)
>   				err = gpu_fill(obj, ctx, engine, dw);
>   			if (err) {
> @@ -618,32 +629,158 @@ static int igt_ctx_exec(void *arg)
>   				obj = NULL;
>   				dw = 0;
>   			}
> +
>   			ndwords++;
> +			ncontexts++;
> +		}
> +
> +		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
> +			ncontexts, engine->name, ndwords);
> +
> +		ncontexts = dw = 0;
> +		list_for_each_entry(obj, &objects, st_link) {
> +			unsigned int rem =
> +				min_t(unsigned int, ndwords - dw, max_dwords(obj));
> +
> +			err = cpu_check(obj, ncontexts++, rem);
> +			if (err)
> +				break;
> +
> +			dw += rem;
>   		}
> -		ncontexts++;
> +
> +out_unlock:
> +		if (end_live_test(&t))
> +			err = -EIO;
> +		mutex_unlock(&i915->drm.struct_mutex);
> +
> +		mock_file_free(i915, file);
> +		if (err)
> +			return err;
>   	}
> -	pr_info("Submitted %lu contexts (across %u engines), filling %lu dwords\n",
> -		ncontexts, RUNTIME_INFO(i915)->num_rings, ndwords);
>   
> -	dw = 0;
> -	list_for_each_entry(obj, &objects, st_link) {
> -		unsigned int rem =
> -			min_t(unsigned int, ndwords - dw, max_dwords(obj));
> +	return 0;
> +}
>   
> -		err = cpu_check(obj, rem);
> +static int igt_shared_ctx_exec(void *arg)
> +{
> +	struct drm_i915_private *i915 = arg;
> +	struct intel_engine_cs *engine;
> +	enum intel_engine_id id;
> +	int err = -ENODEV;
> +
> +	/*
> +	 * Create a few different contexts with the same mm and write
> +	 * through each ctx using the GPU making sure those writes end
> +	 * up in the expected pages of our obj.
> +	 */
> +
> +	for_each_engine(engine, i915, id) {
> +		unsigned long ncontexts, ndwords, dw;
> +		struct drm_i915_gem_object *obj = NULL;
> +		struct i915_gem_context *ctx = NULL;
> +		struct i915_gem_context *parent;
> +		struct drm_file *file;
> +		IGT_TIMEOUT(end_time);
> +		LIST_HEAD(objects);
> +		struct live_test t;
> +
> +		if (!intel_engine_can_store_dword(engine))
> +			continue;
> +
> +		file = mock_file(i915);
> +		if (IS_ERR(file))
> +			return PTR_ERR(file);
> +
> +		mutex_lock(&i915->drm.struct_mutex);
> +
> +		err = begin_live_test(&t, i915, __func__, engine->name);
>   		if (err)
> -			break;
> +			goto out_unlock;
>   
> -		dw += rem;
> -	}
> +		parent = i915_gem_create_context(i915, file->driver_priv);
> +		if (IS_ERR(parent)) {
> +			err = PTR_ERR(parent);
> +			if (err == -ENODEV) /* no logical ctx support */
> +				err = 0;
> +			goto out_unlock;
> +		}
> +
> +		if (!parent->ppgtt) {
> +			err = 0;
> +			goto out_unlock;
> +		}
> +
> +		ncontexts = 0;
> +		ndwords = 0;
> +		dw = 0;
> +		while (!time_after(jiffies, end_time)) {
> +			intel_wakeref_t wakeref;
> +
> +			if (ctx)
> +				__destroy_hw_context(ctx, file->driver_priv);
> +
> +			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			if (IS_ERR(ctx)) {
> +				err = PTR_ERR(ctx);
> +				goto out_unlock;
> +			}
> +
> +			__set_ppgtt(ctx, parent->ppgtt);
> +
> +			if (!obj) {
> +				obj = create_test_object(parent, file, &objects);
> +				if (IS_ERR(obj)) {
> +					err = PTR_ERR(obj);
> +					goto out_unlock;
> +				}
> +			}
> +
> +			err = 0;
> +			with_intel_runtime_pm(i915, wakeref)
> +				err = gpu_fill(obj, ctx, engine, dw);
> +			if (err) {
> +				pr_err("Failed to fill dword %lu [%lu/%lu] with gpu (%s) in ctx %u [full-ppgtt? %s], err=%d\n",
> +				       ndwords, dw, max_dwords(obj),
> +				       engine->name, ctx->hw_id,
> +				       yesno(!!ctx->ppgtt), err);
> +				goto out_unlock;
> +			}
> +
> +			if (++dw == max_dwords(obj)) {
> +				obj = NULL;
> +				dw = 0;
> +			}
> +
> +			ndwords++;
> +			ncontexts++;
> +		}
> +		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
> +			ncontexts, engine->name, ndwords);
> +
> +		ncontexts = dw = 0;
> +		list_for_each_entry(obj, &objects, st_link) {
> +			unsigned int rem =
> +				min_t(unsigned int, ndwords - dw, max_dwords(obj));
> +
> +			err = cpu_check(obj, ncontexts++, rem);
> +			if (err)
> +				break;
> +
> +			dw += rem;
> +		}
>   
>   out_unlock:
> -	if (end_live_test(&t))
> -		err = -EIO;
> -	mutex_unlock(&i915->drm.struct_mutex);
> +		if (end_live_test(&t))
> +			err = -EIO;
> +		mutex_unlock(&i915->drm.struct_mutex);
>   
> -	mock_file_free(i915, file);
> -	return err;
> +		mock_file_free(i915, file);
> +		if (err)
> +			return err;
> +	}
> +
> +	return 0;
>   }
>   
>   static int igt_ctx_readonly(void *arg)
> @@ -652,7 +789,7 @@ static int igt_ctx_readonly(void *arg)
>   	struct drm_i915_gem_object *obj = NULL;
>   	struct i915_gem_context *ctx;
>   	struct i915_hw_ppgtt *ppgtt;
> -	unsigned long ndwords, dw;
> +	unsigned long idx, ndwords, dw;
>   	struct drm_file *file;
>   	I915_RND_STATE(prng);
>   	IGT_TIMEOUT(end_time);
> @@ -733,6 +870,7 @@ static int igt_ctx_readonly(void *arg)
>   		ndwords, RUNTIME_INFO(i915)->num_rings);
>   
>   	dw = 0;
> +	idx = 0;
>   	list_for_each_entry(obj, &objects, st_link) {
>   		unsigned int rem =
>   			min_t(unsigned int, ndwords - dw, max_dwords(obj));
> @@ -742,7 +880,7 @@ static int igt_ctx_readonly(void *arg)
>   		if (i915_gem_object_is_readonly(obj))
>   			num_writes = 0;
>   
> -		err = cpu_check(obj, num_writes);
> +		err = cpu_check(obj, idx++, num_writes);
>   		if (err)
>   			break;
>   
> @@ -1214,6 +1352,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
>   		SUBTEST(live_nop_switch),
>   		SUBTEST(igt_ctx_exec),
>   		SUBTEST(igt_ctx_readonly),
> +		SUBTEST(igt_shared_ctx_exec),
>   		SUBTEST(igt_vm_isolation),
>   	};
>   
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> index 35eb40e5de91..298c4e11deda 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
> @@ -1020,7 +1020,6 @@ static int exercise_ppgtt(struct drm_i915_private *dev_priv,
>   
>   	err = func(dev_priv, &ppgtt->vm, 0, ppgtt->vm.total, end_time);
>   
> -	i915_ppgtt_close(&ppgtt->vm);
>   	i915_ppgtt_put(ppgtt);
>   out_unlock:
>   	mutex_unlock(&dev_priv->drm.struct_mutex);
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index 2009e776b5d8..b9fd2a4b95e9 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -59,13 +59,17 @@ mock_context(struct drm_i915_private *i915,
>   		goto err_handles;
>   
>   	if (name) {
> +		struct i915_hw_ppgtt *ppgtt;
> +
>   		ctx->name = kstrdup(name, GFP_KERNEL);
>   		if (!ctx->name)
>   			goto err_put;
>   
> -		ctx->ppgtt = mock_ppgtt(i915, name);
> -		if (!ctx->ppgtt)
> +		ppgtt = mock_ppgtt(i915, name);
> +		if (!ppgtt)
>   			goto err_put;
> +
> +		__set_ppgtt(ctx, ppgtt);
>   	}
>   
>   	return ctx;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index c3336c931a95..eb19cf8a77ef 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1516,6 +1516,7 @@ struct drm_i915_gem_context_param {
>   #define   I915_CONTEXT_MAX_USER_PRIORITY	1023 /* inclusive */
>   #define   I915_CONTEXT_DEFAULT_PRIORITY		0
>   #define   I915_CONTEXT_MIN_USER_PRIORITY	-1023 /* inclusive */
> +#define I915_CONTEXT_PARAM_VM		0x7

And some kernel doc for the param of course. :)
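
At minimum something capturing the intended flow, roughly like this
(sketch only; the vm id in .value would come from either VM_CREATE or a
GETPARAM on another context):

	struct drm_i915_gem_context_param p = {
		.ctx_id = ctx_a,
		.param = I915_CONTEXT_PARAM_VM,
	};

	/* export ctx_a's vm as an id on this fd */
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_GETPARAM, &p);

	/* and point ctx_b at the very same vm */
	p.ctx_id = ctx_b;
	ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);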

>   	__u64 value;
>   };
>   
> 

Okay, I went a bit back and forth with this patch since I didn't really
understand the ctx and ppgtt lifetime/open-close rules.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts
  2019-01-23 11:51     ` Chris Wilson
@ 2019-01-23 12:03       ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-23 12:03 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 23/01/2019 11:51, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-23 11:30:39)
>>> +int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
>>> +                          struct drm_file *file)
>>> +{
>>> +     struct drm_i915_private *i915 = to_i915(dev);
>>> +     struct drm_i915_gem_vm_create *args = data;
>>> +     struct drm_i915_file_private *file_priv = file->driver_priv;
>>> +     struct i915_hw_ppgtt *ppgtt;
>>> +     int err;
>>> +
>>> +     if (!HAS_FULL_PPGTT(i915))
>>> +             return -ENODEV;
>>> +
>>> +     if (args->flags)
>>> +             return -EINVAL;
>>> +
>>> +     if (args->extensions)
>>> +             return -EINVAL;
>>> +
>>> +     ppgtt = i915_ppgtt_create(i915, file_priv);
>>> +     if (IS_ERR(ppgtt))
>>> +             return PTR_ERR(ppgtt);
>>> +
>>> +     err = mutex_lock_interruptible(&file_priv->vm_lock);
>>> +     if (err)
>>> +             goto err_put;
>>> +
>>> +     err = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
>>> +     mutex_unlock(&file_priv->vm_lock);
>>> +     if (err < 0)
>>> +             goto err_put;
>>
>>          else if (GEM_WARN_ON(err == 0) {
>>                  err = -EINVAL;
>>                  goto err_put;
>>          }
>>
>> ?
>>
>> Or a GEM_BUG_ON(err == 0) if you must. :)
> 
> Not our bug :) We told it never to return 0.
> 
>>> +
>>> +     ppgtt->user_handle = err;
>>> +     args->id = err;
>>> +     return 0;
>>> +
>>> +err_put:
>>> +     i915_ppgtt_put(ppgtt);
>>> +     return err;
>>> +}
>>> +
>>> +int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
>>
>> Can we cheat and declare data as u32 * here and so avoid need for the
>> local, since there are only two dereferences?
> 
> Tempting, we need to cook up the macros to hide the function pointer
> differences for the ioctl tables.

Probably not, then. I was hoping we could get away with the function
declaration and definition not being exactly the same, and that this
would be all that's needed. If not, never mind.

Regards,

Tvrtko

>>> +                           struct drm_file *file)
>>> +{
>>> +     struct drm_i915_file_private *file_priv = file->driver_priv;
>>> +     struct i915_hw_ppgtt *ppgtt;
>>> +     int err;
>>> +     u32 id;
>>> +
>>> +     id = *(u32 *)data;
> 
> Oh crikey, did I write this?
> 
> Remember, do it right the first time as I won't remember when I was
> cutting corners.
> 
>>> +     if (!id)
>>> +             return -ENOENT;
>>> +
>>> +     err = mutex_lock_interruptible(&file_priv->vm_lock);
>>> +     if (err)
>>> +             return err;
>>> +
>>> +     ppgtt = idr_remove(&file_priv->vm_idr, id);
>>> +     if (ppgtt)
>>
>> GEM_WARN_ON(id != ppgtt->user_handle) too much paranoia?
> 
> BUG_ON then!
> 
>>> +             ppgtt->user_handle = 0;
>>> +
>>> +     mutex_unlock(&file_priv->vm_lock);
>>> +     if (!ppgtt)
>>> +             return -ENOENT;
>>> +
>>> +     i915_ppgtt_put(ppgtt);
>>> +     return 0;
>>
>> Or end with simply:
>>
>> i915_ppgtt_put(ppgtt);
>>
>> return ppgtt ? 0 : -ENOENT;
>>
>> ?
> 
> I feel a slight disappointment at the anticlimactic nature of this
> function, leaving a gap, nay, a yearning, for more.
> -Chris
> 

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context
  2019-01-23 12:00   ` Tvrtko Ursulin
@ 2019-01-23 12:15     ` Chris Wilson
  0 siblings, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-23 12:15 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-23 12:00:49)
> 
> On 18/01/2019 14:01, Chris Wilson wrote:
> > Allow the user to share ppGTT between contexts on the same fd. This
> > gives the user the ability to relax their context isolation to share vm
> > between their own contexts, but does not allow them to import a vm from
> > another fd. The use case for sharing a vm is for the proposed virtual
> > engine work where a context may be created explicitly to setup a load
> > balancing engine, but always be run in conjunction with a second context
> > for rcs/compute etc. Giving the user control over how they set up the
> > vm allows them to have a single vm shared between all kernel contexts
> > being used to emulate a single client context, similarly to how we use
> > a single vm across all engines within a single kernel context today.
> > It also allows for the future specification of a separate vm between
> > engines inside a single kernel context, should that be desired.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > ---
> >   drivers/gpu/drm/i915/i915_gem_context.c       | 118 ++++++++-
> >   drivers/gpu/drm/i915/i915_gem_gtt.c           |  17 +-
> >   drivers/gpu/drm/i915/i915_gem_gtt.h           |  14 +-
> >   drivers/gpu/drm/i915/selftests/huge_pages.c   |   1 -
> >   .../gpu/drm/i915/selftests/i915_gem_context.c | 247 ++++++++++++++----
> >   drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   1 -
> >   drivers/gpu/drm/i915/selftests/mock_context.c |   8 +-
> >   include/uapi/drm/i915_drm.h                   |   1 +
> >   8 files changed, 339 insertions(+), 68 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> > index 7c90981704bf..f707241dbc78 100644
> > --- a/drivers/gpu/drm/i915/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> > @@ -109,6 +109,8 @@ static void lut_close(struct i915_gem_context *ctx)
> >               struct i915_vma *vma = rcu_dereference_raw(*slot);
> >   
> >               radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
> > +
> > +             vma->open_count--;
> 
> I did not figure out what this is: a) why open-coded vma management
> without any comments, and b) the rest of the patch doesn't seem to
> touch this tree.

As the vm may be shared between multiple contexts, we may then open the
same vma and record it in the different context lut. Each instance needs
to be accounted for.

> >               __i915_gem_object_release_unless_active(vma->obj);
> >       }
> >       rcu_read_unlock();
> > @@ -291,7 +293,7 @@ static void context_close(struct i915_gem_context *ctx)
> >        */
> >       lut_close(ctx);
> >       if (ctx->ppgtt)
> > -             i915_ppgtt_close(&ctx->ppgtt->vm);
> > +             i915_ppgtt_close(ctx->ppgtt);
> 
> I'll need to figure out if it is okay for context to close the ppgtt 
> instead of just dropping references to it. Like two contexts sharing 
> ppgtt and one closes it, the other one should continue to work fine, no? 
> Or even a third context is created sharing the same ppgtt.

ppgtt->open_count? We don't close until everyone agrees.
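
Roughly (not new code, just the intended pairing from this patch):

	__set_ppgtt(ctx, ppgtt);   /* i915_ppgtt_open(): open_count++ */
	...
	context_close(ctx);        /* i915_ppgtt_close(): only marks the
				    * vm closed once --open_count drops
				    * to zero
				    */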

> >       ctx->file_priv = ERR_PTR(-EBADF);
> >       i915_gem_context_put(ctx);
> > @@ -401,6 +403,23 @@ static void __destroy_hw_context(struct i915_gem_context *ctx,
> >       context_close(ctx);
> >   }
> >   
> > +static void __set_ppgtt(struct i915_gem_context *ctx,
> > +                     struct i915_hw_ppgtt *ppgtt)
> > +{
> > +     if (ppgtt == ctx->ppgtt)
> > +             return;
> > +
> > +     if (ctx->ppgtt) {
> > +             i915_ppgtt_close(ctx->ppgtt);
> 
> Feels incorrect to close it if it could be shared and in use elsewhere.
> 
> > +             i915_ppgtt_put(ctx->ppgtt);
> > +     }
> > +
> > +     i915_ppgtt_open(ppgtt);
> 
> Do we need some protection against trying to re-open a closed ppgtt here?

We BUG_ON as a closed ppgtt shouldn't have been accessible via the file_priv.

> > +     ctx->ppgtt = i915_ppgtt_get(ppgtt);
> > +
> > +     ctx->desc_template = default_desc_template(ctx->i915, ppgtt);
> > +}
> > +
> >   static struct i915_gem_context *
> >   i915_gem_create_context(struct drm_i915_private *dev_priv,
> >                       struct drm_i915_file_private *file_priv)
> > @@ -427,8 +446,8 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
> >                       return ERR_CAST(ppgtt);
> >               }
> >   
> > -             ctx->ppgtt = ppgtt;
> > -             ctx->desc_template = default_desc_template(dev_priv, ppgtt);
> > +             __set_ppgtt(ctx, ppgtt);
> > +             i915_ppgtt_put(ppgtt);
> >       }
> >   
> >       trace_i915_context_create(ctx);
> > @@ -784,6 +803,87 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
> >       return 0;
> >   }
> >   
> > +static int get_ppgtt(struct i915_gem_context *ctx, u64 *out)
> 
> u32 *out ?

Oh, left over from planning for setting a different vm on each engine. In
the end, I thought a separate setter/getter was sensible rather than
trying to overload a simple interface with a more complex one.

> > +{
> > +     struct drm_i915_file_private *file_priv = ctx->file_priv;
> > +     struct i915_hw_ppgtt *ppgtt;
> > +     int ret;
> > +
> > +     /* XXX rcu acquire? */
> > +     ppgtt = ctx->ppgtt;
> > +     if (!ppgtt)
> > +             return -ENODEV;
> > +
> > +     ret = mutex_lock_interruptible(&file_priv->vm_lock);
> > +     if (ret)
> > +             return ret;
> > +
> > +     ret = ppgtt->user_handle;
> > +     if (!ret) {
> > +             ret = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
> 
> GEM_WARN_ON(ret == 0) just in case?
> 
> > +             if (ret > 0) {
> > +                     ppgtt->user_handle = ret;
> > +                     i915_ppgtt_get(ppgtt);
> > +             }
> > +     }
> > +
> > +     mutex_unlock(&file_priv->vm_lock);
> > +     if (ret < 0)
> > +             return ret;
> > +
> > +     *out = ret;
> > +     return 0;
> > +}
> > +
> > +static int set_ppgtt(struct i915_gem_context *ctx, u32 id)
> > +{
> > +     struct drm_i915_file_private *file_priv = ctx->file_priv;
> > +     struct i915_hw_ppgtt *ppgtt;
> > +     int err;
> > +
> > +     err = mutex_lock_interruptible(&file_priv->vm_lock);
> > +     if (err)
> > +             return err;
> > +
> > +     ppgtt = idr_find(&file_priv->vm_idr, id);
> > +     if (ppgtt)
> > +             i915_ppgtt_get(ppgtt);
> > +     mutex_unlock(&file_priv->vm_lock);
> > +     if (!ppgtt)
> > +             return -ENOENT;
> > +
> > +     err = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
> > +     if (err)
> > +             goto out;
> > +
> > +     /*
> > +      * We need to flush any requests using the current ppgtt before
> > +      * we release it as the requests do not hold a reference themselves,
> > +      * only indirectly through the context. By switching to the kernel
> > +      * context, we ensure that the TLBs are reloaded before using the
> > +      * same context again -- an extra layer of paranoia over wait_for_idle.
> > +      */
> > +     err = i915_gem_switch_to_kernel_context(ctx->i915);
> > +     if (err)
> > +             goto out_unlock;
> > +
> > +     err = i915_gem_wait_for_idle(ctx->i915,
> > +                                  I915_WAIT_LOCKED |
> > +                                  I915_WAIT_INTERRUPTIBLE,
> > +                                  MAX_SCHEDULE_TIMEOUT);
> 
> This is a bit worrying. Every new client setting up their contexts
> causes a global sync point, which sounds bad for scalability. In
> practice the event might be happening only every few seconds, rather
> than multiple times per second, but I am only guessing. How difficult
> would it be to make requests own the ppgtt directly?

Or just add a retire callback from the kernel-context. The more we look,
the more use cases we find.

> > +static inline struct i915_hw_ppgtt *i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
> >   {
> > -     if (ppgtt)
> > -             kref_get(&ppgtt->ref);
> > +     kref_get(&ppgtt->ref);
> > +     return ppgtt;
> 
> Unrelated hunk?

Am I not allowed to tidy up as I go along! I use it in this patch :)

> > diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> > index 6a241745e78a..2864cfb82325 100644
> > --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> > +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> > @@ -243,6 +243,12 @@ static int live_nop_switch(void *arg)
> >       return err;
> >   }
> >   
> > +#define GEN8_HIGH_ADDRESS_BIT 47
> > +static inline u64 gen8_canonical_addr(u64 address)
> > +{
> > +     return sign_extend64(address, GEN8_HIGH_ADDRESS_BIT);
> > +}
> 
> We could move the copy from i915_gem_execbuffer.c to i915_gem_utils.h or 
> somewhere.

It's probably not even worth it; this is a debug hunk for another
problem, now its own selftest to expose the issue.

> Okay, I went a bit back and forth with this patch since I didn't really
> understand the ctx and ppgtt lifetime/open-close rules.

And I didn't even notice I was replying to a reply.
-Chris

^ permalink raw reply	[flat|nested] 66+ messages in thread

* [PATCH v3] drm/i915: Create/destroy VM (ppGTT) for use with contexts
  2019-01-23 11:30   ` Tvrtko Ursulin
  2019-01-23 11:51     ` Chris Wilson
@ 2019-01-24 15:58     ` Chris Wilson
  1 sibling, 0 replies; 66+ messages in thread
From: Chris Wilson @ 2019-01-24 15:58 UTC (permalink / raw)
  To: intel-gfx

In preparation for making the ppGTT binding for a context explicit (to
facilitate reusing the same ppGTT between different contexts), allow the
user to create and destroy named ppGTTs.

v2: Replace global barrier for swapping over the ppgtt and tlbs with a
local context barrier (Tvrtko)
v3: serialise with struct_mutex; it's lazy but required dammit

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
uAPI docs after the headache subsides.
-Chris
---
 drivers/gpu/drm/i915/i915_drv.c               |   2 +
 drivers/gpu/drm/i915/i915_drv.h               |   3 +
 drivers/gpu/drm/i915/i915_gem_context.c       | 244 +++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_context.h       |   5 +
 drivers/gpu/drm/i915/i915_gem_gtt.c           |  17 +-
 drivers/gpu/drm/i915/i915_gem_gtt.h           |  16 +-
 drivers/gpu/drm/i915/selftests/huge_pages.c   |   1 -
 .../gpu/drm/i915/selftests/i915_gem_context.c | 240 +++++++++++++----
 drivers/gpu/drm/i915/selftests/i915_gem_gtt.c |   1 -
 drivers/gpu/drm/i915/selftests/mock_context.c |   8 +-
 include/uapi/drm/i915_drm.h                   |  11 +
 11 files changed, 477 insertions(+), 71 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 337ee650d2de..601ef06fdce1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -3003,6 +3003,8 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_PERF_ADD_CONFIG, i915_perf_add_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_PERF_REMOVE_CONFIG, i915_perf_remove_config_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_QUERY, i915_query_ioctl, DRM_UNLOCKED|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_CREATE, i915_gem_vm_create_ioctl, DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_VM_DESTROY, i915_gem_vm_destroy_ioctl, DRM_RENDER_ALLOW),
 };
 
 static struct drm_driver driver = {
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index a3e7a5bf0381..2f90e16b3b02 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -217,6 +217,9 @@ struct drm_i915_file_private {
 	} mm;
 	struct idr context_idr;
 
+	struct mutex vm_lock;
+	struct idr vm_idr;
+
 	struct intel_rps_client {
 		atomic_t boosts;
 	} rps_client;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index c2399542d0d2..c4cd2577f7c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -110,6 +110,8 @@ static void lut_close(struct i915_gem_context *ctx)
 		struct i915_vma *vma = rcu_dereference_raw(*slot);
 
 		radix_tree_iter_delete(&ctx->handles_vma, &iter, slot);
+
+		vma->open_count--;
 		__i915_gem_object_release_unless_active(vma->obj);
 	}
 	rcu_read_unlock();
@@ -293,7 +295,7 @@ static void context_close(struct i915_gem_context *ctx)
 	 */
 	lut_close(ctx);
 	if (ctx->ppgtt)
-		i915_ppgtt_close(&ctx->ppgtt->vm);
+		i915_ppgtt_close(ctx->ppgtt);
 
 	ctx->file_priv = ERR_PTR(-EBADF);
 	i915_gem_context_put(ctx);
@@ -424,6 +426,32 @@ static void __destroy_hw_context(struct i915_gem_context *ctx,
 	context_close(ctx);
 }
 
+static struct i915_hw_ppgtt *
+__set_ppgtt(struct i915_gem_context *ctx, struct i915_hw_ppgtt *ppgtt)
+{
+	struct i915_hw_ppgtt *old = ctx->ppgtt;
+
+	i915_ppgtt_open(ppgtt);
+	ctx->ppgtt = i915_ppgtt_get(ppgtt);
+
+	ctx->desc_template = default_desc_template(ctx->i915, ppgtt);
+
+	return old;
+}
+
+static void __assign_ppgtt(struct i915_gem_context *ctx,
+			   struct i915_hw_ppgtt *ppgtt)
+{
+	if (ppgtt == ctx->ppgtt)
+		return;
+
+	ppgtt = __set_ppgtt(ctx, ppgtt);
+	if (ppgtt) {
+		i915_ppgtt_close(ppgtt);
+		i915_ppgtt_put(ppgtt);
+	}
+}
+
 static struct i915_gem_context *
 i915_gem_create_context(struct drm_i915_private *dev_priv,
 			struct drm_i915_file_private *file_priv)
@@ -450,8 +478,8 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
 			return ERR_CAST(ppgtt);
 		}
 
-		ctx->ppgtt = ppgtt;
-		ctx->desc_template = default_desc_template(dev_priv, ppgtt);
+		__assign_ppgtt(ctx, ppgtt);
+		i915_ppgtt_put(ppgtt);
 	}
 
 	trace_i915_context_create(ctx);
@@ -632,19 +660,29 @@ static int context_idr_cleanup(int id, void *p, void *data)
 	return 0;
 }
 
+static int vm_idr_cleanup(int id, void *p, void *data)
+{
+	i915_ppgtt_put(p);
+	return 0;
+}
+
 int i915_gem_context_open(struct drm_i915_private *i915,
 			  struct drm_file *file)
 {
 	struct drm_i915_file_private *file_priv = file->driver_priv;
 	struct i915_gem_context *ctx;
 
+	mutex_init(&file_priv->vm_lock);
+
 	idr_init(&file_priv->context_idr);
+	idr_init_base(&file_priv->vm_idr, 1);
 
 	mutex_lock(&i915->drm.struct_mutex);
 	ctx = i915_gem_create_context(i915, file_priv);
 	mutex_unlock(&i915->drm.struct_mutex);
 	if (IS_ERR(ctx)) {
 		idr_destroy(&file_priv->context_idr);
+		idr_destroy(&file_priv->vm_idr);
 		return PTR_ERR(ctx);
 	}
 
@@ -661,6 +699,89 @@ void i915_gem_context_close(struct drm_file *file)
 
 	idr_for_each(&file_priv->context_idr, context_idr_cleanup, NULL);
 	idr_destroy(&file_priv->context_idr);
+
+	idr_for_each(&file_priv->vm_idr, vm_idr_cleanup, NULL);
+	idr_destroy(&file_priv->vm_idr);
+
+	mutex_destroy(&file_priv->vm_lock);
+}
+
+int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file)
+{
+	struct drm_i915_private *i915 = to_i915(dev);
+	struct drm_i915_gem_vm_control *args = data;
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int err;
+
+	if (!HAS_FULL_PPGTT(i915))
+		return -ENODEV;
+
+	if (args->flags)
+		return -EINVAL;
+
+	if (args->extensions)
+		return -EINVAL;
+
+	ppgtt = i915_ppgtt_create(i915, file_priv);
+	if (IS_ERR(ppgtt))
+		return PTR_ERR(ppgtt);
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		goto err_put;
+
+	err = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
+	mutex_unlock(&file_priv->vm_lock);
+	if (err < 0)
+		goto err_put;
+
+	GEM_BUG_ON(err == 0); /* reserved for default/unassigned ppgtt */
+	ppgtt->user_handle = err;
+	args->id = err;
+	return 0;
+
+err_put:
+	i915_ppgtt_put(ppgtt);
+	return err;
+}
+
+int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file)
+{
+	struct drm_i915_file_private *file_priv = file->driver_priv;
+	struct drm_i915_gem_vm_control *args = data;
+	struct i915_hw_ppgtt *ppgtt;
+	int err;
+	u32 id;
+
+	if (args->flags)
+		return -EINVAL;
+
+	if (args->extensions)
+		return -EINVAL;
+
+	id = args->id;
+	if (!id)
+		return -ENOENT;
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		return err;
+
+	ppgtt = idr_remove(&file_priv->vm_idr, id);
+	if (ppgtt) {
+		GEM_BUG_ON(!ppgtt->user_handle);
+		ppgtt->user_handle = 0;
+	}
+
+	mutex_unlock(&file_priv->vm_lock);
+	if (!ppgtt)
+		return -ENOENT;
+
+	i915_ppgtt_put(ppgtt);
+	return 0;
 }
 
 static struct i915_request *
@@ -809,6 +930,110 @@ int i915_gem_switch_to_kernel_context(struct drm_i915_private *i915)
 	return 0;
 }
 
+static int get_ppgtt(struct i915_gem_context *ctx, u64 *out)
+{
+	struct drm_i915_file_private *file_priv = ctx->file_priv;
+	struct i915_hw_ppgtt *ppgtt;
+	int ret;
+
+	if (!ctx->ppgtt)
+		return -ENODEV;
+
+	/* XXX rcu acquire? */
+	ret = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (ret)
+		return ret;
+
+	ppgtt = i915_ppgtt_get(ctx->ppgtt);
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+	ret = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (ret)
+		goto err_put;
+
+	if (!ppgtt->user_handle) {
+		ret = idr_alloc(&file_priv->vm_idr, ppgtt, 0, 0, GFP_KERNEL);
+		GEM_BUG_ON(!ret);
+		if (ret < 0)
+			goto err_unlock;
+
+		ppgtt->user_handle = ret;
+		i915_ppgtt_get(ppgtt);
+	}
+
+	*out = ppgtt->user_handle;
+	ret = 0;
+err_unlock:
+	mutex_unlock(&file_priv->vm_lock);
+err_put:
+	i915_ppgtt_put(ppgtt);
+	return ret;
+}
+
+static void set_ppgtt_barrier(void *data)
+{
+	struct i915_hw_ppgtt *old = data;
+
+	i915_ppgtt_close(old);
+	i915_ppgtt_put(old);
+}
+
+static int set_ppgtt(struct i915_gem_context *ctx,
+		    struct drm_i915_gem_context_param *args)
+{
+	struct drm_i915_file_private *file_priv = ctx->file_priv;
+	struct i915_hw_ppgtt *ppgtt, *old;
+	int err;
+
+	if (args->size)
+		return -EINVAL;
+
+	if (upper_32_bits(args->value))
+		return -EINVAL;
+
+	if (!ctx->ppgtt)
+		return -ENODEV;
+
+	err = mutex_lock_interruptible(&file_priv->vm_lock);
+	if (err)
+		return err;
+
+	ppgtt = idr_find(&file_priv->vm_idr, args->value);
+	if (ppgtt) {
+		GEM_BUG_ON(ppgtt->user_handle != args->value);
+		i915_ppgtt_get(ppgtt);
+	}
+	mutex_unlock(&file_priv->vm_lock);
+	if (!ppgtt)
+		return -ENOENT;
+
+	err = mutex_lock_interruptible(&ctx->i915->drm.struct_mutex);
+	if (err)
+		goto out;
+
+	old = __set_ppgtt(ctx, ppgtt);
+
+	/*
+	 * We need to flush any requests using the current ppgtt before
+	 * we release it as the requests do not hold a reference themselves,
+	 * only indirectly through the context.
+	 */
+	err = context_barrier_task(ctx, set_ppgtt_barrier, old);
+	if (err) {
+		ctx->ppgtt = old;
+		ctx->desc_template = default_desc_template(ctx->i915, old);
+
+		i915_ppgtt_close(ppgtt);
+		i915_ppgtt_put(ppgtt);
+	}
+
+	mutex_unlock(&ctx->i915->drm.struct_mutex);
+
+out:
+	i915_ppgtt_put(ppgtt);
+	return err;
+}
+
 static bool client_is_banned(struct drm_i915_file_private *file_priv)
 {
 	return atomic_read(&file_priv->ban_score) >= I915_CLIENT_SCORE_BANNED;
@@ -979,6 +1204,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_SSEU:
 		ret = get_sseu(ctx, args);
 		break;
+	case I915_CONTEXT_PARAM_VM:
+		ret = get_ppgtt(ctx, &args->value);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1278,9 +1506,6 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 		return -ENOENT;
 
 	switch (args->param) {
-	case I915_CONTEXT_PARAM_BAN_PERIOD:
-		ret = -EINVAL;
-		break;
 	case I915_CONTEXT_PARAM_NO_ZEROMAP:
 		if (args->size)
 			ret = -EINVAL;
@@ -1327,9 +1552,16 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 					I915_USER_PRIORITY(priority);
 		}
 		break;
+
 	case I915_CONTEXT_PARAM_SSEU:
 		ret = set_sseu(ctx, args);
 		break;
+
+	case I915_CONTEXT_PARAM_VM:
+		ret = set_ppgtt(ctx, args);
+		break;
+
+	case I915_CONTEXT_PARAM_BAN_PERIOD:
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
index 3217ece7b8b4..08f8bbf11100 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.h
+++ b/drivers/gpu/drm/i915/i915_gem_context.h
@@ -354,6 +354,11 @@ void i915_gem_context_release(struct kref *ctx_ref);
 struct i915_gem_context *
 i915_gem_context_create_gvt(struct drm_device *dev);
 
+int i915_gem_vm_create_ioctl(struct drm_device *dev, void *data,
+			     struct drm_file *file);
+int i915_gem_vm_destroy_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file);
+
 int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
 				  struct drm_file *file);
 int i915_gem_context_destroy_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index 49b00996a15e..27b40c14e745 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -2104,10 +2104,21 @@ i915_ppgtt_create(struct drm_i915_private *i915,
 	return ppgtt;
 }
 
-void i915_ppgtt_close(struct i915_address_space *vm)
+void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt)
 {
-	GEM_BUG_ON(vm->closed);
-	vm->closed = true;
+	GEM_BUG_ON(ppgtt->vm.closed);
+
+	ppgtt->open_count++;
+}
+
+void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt)
+{
+	GEM_BUG_ON(!ppgtt->open_count);
+	if (--ppgtt->open_count)
+		return;
+
+	GEM_BUG_ON(ppgtt->vm.closed);
+	ppgtt->vm.closed = true;
 }
 
 static void ppgtt_destroy_vma(struct i915_address_space *vm)
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
index 03ade71b8d9a..bb750318f52a 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.h
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -391,11 +391,15 @@ struct i915_hw_ppgtt {
 	struct kref ref;
 
 	unsigned long pd_dirty_rings;
+	unsigned int open_count;
+
 	union {
 		struct i915_pml4 pml4;		/* GEN8+ & 48b PPGTT */
 		struct i915_page_directory_pointer pdp;	/* GEN8+ */
 		struct i915_page_directory pd;		/* GEN6-7 */
 	};
+
+	u32 user_handle;
 };
 
 struct gen6_hw_ppgtt {
@@ -606,12 +610,16 @@ int i915_ppgtt_init_hw(struct drm_i915_private *dev_priv);
 void i915_ppgtt_release(struct kref *kref);
 struct i915_hw_ppgtt *i915_ppgtt_create(struct drm_i915_private *dev_priv,
 					struct drm_i915_file_private *fpriv);
-void i915_ppgtt_close(struct i915_address_space *vm);
-static inline void i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
+
+void i915_ppgtt_open(struct i915_hw_ppgtt *ppgtt);
+void i915_ppgtt_close(struct i915_hw_ppgtt *ppgtt);
+
+static inline struct i915_hw_ppgtt *i915_ppgtt_get(struct i915_hw_ppgtt *ppgtt)
 {
-	if (ppgtt)
-		kref_get(&ppgtt->ref);
+	kref_get(&ppgtt->ref);
+	return ppgtt;
 }
+
 static inline void i915_ppgtt_put(struct i915_hw_ppgtt *ppgtt)
 {
 	if (ppgtt)
diff --git a/drivers/gpu/drm/i915/selftests/huge_pages.c b/drivers/gpu/drm/i915/selftests/huge_pages.c
index a9a2fa35876f..a7ee8e97bcee 100644
--- a/drivers/gpu/drm/i915/selftests/huge_pages.c
+++ b/drivers/gpu/drm/i915/selftests/huge_pages.c
@@ -1734,7 +1734,6 @@ int i915_gem_huge_page_mock_selftests(void)
 	err = i915_subtests(tests, ppgtt);
 
 out_close:
-	i915_ppgtt_close(&ppgtt->vm);
 	i915_ppgtt_put(ppgtt);
 
 out_unlock:
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
index 333de90789c7..27650966269f 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
@@ -324,6 +324,7 @@ static int gpu_fill(struct drm_i915_gem_object *obj,
 		goto skip_request;
 
 	i915_gem_object_set_active_reference(batch->obj);
+
 	i915_vma_unpin(batch);
 	i915_vma_close(batch);
 
@@ -372,7 +373,8 @@ static int cpu_fill(struct drm_i915_gem_object *obj, u32 value)
 	return 0;
 }
 
-static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
+static noinline int cpu_check(struct drm_i915_gem_object *obj,
+			      unsigned int idx, unsigned int max)
 {
 	unsigned int n, m, needs_flush;
 	int err;
@@ -390,8 +392,10 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
 
 		for (m = 0; m < max; m++) {
 			if (map[m] != m) {
-				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
-				       n, m, map[m], m);
+				pr_err("%pS: Invalid value at object %d page %d/%ld, offset %d/%d: found %x expected %x\n",
+				       __builtin_return_address(0), idx,
+				       n, real_page_count(obj), m, max,
+				       map[m], m);
 				err = -EINVAL;
 				goto out_unmap;
 			}
@@ -399,8 +403,9 @@ static int cpu_check(struct drm_i915_gem_object *obj, unsigned int max)
 
 		for (; m < DW_PER_PAGE; m++) {
 			if (map[m] != STACK_MAGIC) {
-				pr_err("Invalid value at page %d, offset %d: found %x expected %x\n",
-				       n, m, map[m], STACK_MAGIC);
+				pr_err("%pS: Invalid value at object %d page %d, offset %d: found %x expected %x (uninitialised)\n",
+				       __builtin_return_address(0), idx, n, m,
+				       map[m], STACK_MAGIC);
 				err = -EINVAL;
 				goto out_unmap;
 			}
@@ -478,12 +483,8 @@ static unsigned long max_dwords(struct drm_i915_gem_object *obj)
 static int igt_ctx_exec(void *arg)
 {
 	struct drm_i915_private *i915 = arg;
-	struct drm_i915_gem_object *obj = NULL;
-	unsigned long ncontexts, ndwords, dw;
-	struct igt_live_test t;
-	struct drm_file *file;
-	IGT_TIMEOUT(end_time);
-	LIST_HEAD(objects);
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
 	int err = -ENODEV;
 
 	/*
@@ -495,38 +496,42 @@ static int igt_ctx_exec(void *arg)
 	if (!DRIVER_CAPS(i915)->has_logical_contexts)
 		return 0;
 
-	file = mock_file(i915);
-	if (IS_ERR(file))
-		return PTR_ERR(file);
+	for_each_engine(engine, i915, id) {
+		struct drm_i915_gem_object *obj = NULL;
+		unsigned long ncontexts, ndwords, dw;
+		struct igt_live_test t;
+		struct drm_file *file;
+		IGT_TIMEOUT(end_time);
+		LIST_HEAD(objects);
 
-	mutex_lock(&i915->drm.struct_mutex);
+		if (!intel_engine_can_store_dword(engine))
+			continue;
 
-	err = igt_live_test_begin(&t, i915, __func__, "");
-	if (err)
-		goto out_unlock;
+		if (!engine->context_size)
+			continue; /* No logical context support in HW */
 
-	ncontexts = 0;
-	ndwords = 0;
-	dw = 0;
-	while (!time_after(jiffies, end_time)) {
-		struct intel_engine_cs *engine;
-		struct i915_gem_context *ctx;
-		unsigned int id;
+		file = mock_file(i915);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+
+		mutex_lock(&i915->drm.struct_mutex);
 
-		ctx = i915_gem_create_context(i915, file->driver_priv);
-		if (IS_ERR(ctx)) {
-			err = PTR_ERR(ctx);
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
+		if (err)
 			goto out_unlock;
-		}
 
-		for_each_engine(engine, i915, id) {
+		ncontexts = 0;
+		ndwords = 0;
+		dw = 0;
+		while (!time_after(jiffies, end_time)) {
+			struct i915_gem_context *ctx;
 			intel_wakeref_t wakeref;
 
-			if (!engine->context_size)
-				continue; /* No logical context support in HW */
-
-			if (!intel_engine_can_store_dword(engine))
-				continue;
+			ctx = i915_gem_create_context(i915, file->driver_priv);
+			if (IS_ERR(ctx)) {
+				err = PTR_ERR(ctx);
+				goto out_unlock;
+			}
 
 			if (!obj) {
 				obj = create_test_object(ctx, file, &objects);
@@ -536,7 +541,6 @@ static int igt_ctx_exec(void *arg)
 				}
 			}
 
-			err = 0;
 			with_intel_runtime_pm(i915, wakeref)
 				err = gpu_fill(obj, ctx, engine, dw);
 			if (err) {
@@ -551,32 +555,158 @@ static int igt_ctx_exec(void *arg)
 				obj = NULL;
 				dw = 0;
 			}
+
 			ndwords++;
+			ncontexts++;
+		}
+
+		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
+			ncontexts, engine->name, ndwords);
+
+		ncontexts = dw = 0;
+		list_for_each_entry(obj, &objects, st_link) {
+			unsigned int rem =
+				min_t(unsigned int, ndwords - dw, max_dwords(obj));
+
+			err = cpu_check(obj, ncontexts++, rem);
+			if (err)
+				break;
+
+			dw += rem;
 		}
-		ncontexts++;
+
+out_unlock:
+		if (igt_live_test_end(&t))
+			err = -EIO;
+		mutex_unlock(&i915->drm.struct_mutex);
+
+		mock_file_free(i915, file);
+		if (err)
+			return err;
 	}
-	pr_info("Submitted %lu contexts (across %u engines), filling %lu dwords\n",
-		ncontexts, RUNTIME_INFO(i915)->num_rings, ndwords);
 
-	dw = 0;
-	list_for_each_entry(obj, &objects, st_link) {
-		unsigned int rem =
-			min_t(unsigned int, ndwords - dw, max_dwords(obj));
+	return 0;
+}
+
+static int igt_shared_ctx_exec(void *arg)
+{
+	struct drm_i915_private *i915 = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	int err = -ENODEV;
+
+	/*
+	 * Create a few different contexts with the same mm and write
+	 * through each ctx using the GPU making sure those writes end
+	 * up in the expected pages of our obj.
+	 */
+
+	for_each_engine(engine, i915, id) {
+		unsigned long ncontexts, ndwords, dw;
+		struct drm_i915_gem_object *obj = NULL;
+		struct i915_gem_context *ctx = NULL;
+		struct i915_gem_context *parent;
+		struct igt_live_test t;
+		struct drm_file *file;
+		IGT_TIMEOUT(end_time);
+		LIST_HEAD(objects);
+
+		if (!intel_engine_can_store_dword(engine))
+			continue;
+
+		file = mock_file(i915);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
 
-		err = cpu_check(obj, rem);
+		mutex_lock(&i915->drm.struct_mutex);
+
+		err = igt_live_test_begin(&t, i915, __func__, engine->name);
 		if (err)
-			break;
+			goto out_unlock;
 
-		dw += rem;
-	}
+		parent = i915_gem_create_context(i915, file->driver_priv);
+		if (IS_ERR(parent)) {
+			err = PTR_ERR(parent);
+			if (err == -ENODEV) /* no logical ctx support */
+				err = 0;
+			goto out_unlock;
+		}
+
+		if (!parent->ppgtt) {
+			err = 0;
+			goto out_unlock;
+		}
+
+		ncontexts = 0;
+		ndwords = 0;
+		dw = 0;
+		while (!time_after(jiffies, end_time)) {
+			intel_wakeref_t wakeref;
+
+			if (ctx)
+				__destroy_hw_context(ctx, file->driver_priv);
+
+			ctx = i915_gem_create_context(i915, file->driver_priv);
+			if (IS_ERR(ctx)) {
+				err = PTR_ERR(ctx);
+				goto out_unlock;
+			}
+
+			__assign_ppgtt(ctx, parent->ppgtt);
+
+			if (!obj) {
+				obj = create_test_object(parent, file, &objects);
+				if (IS_ERR(obj)) {
+					err = PTR_ERR(obj);
+					goto out_unlock;
+				}
+			}
+
+			err = 0;
+			with_intel_runtime_pm(i915, wakeref)
+				err = gpu_fill(obj, ctx, engine, dw);
+			if (err) {
+				pr_err("Failed to fill dword %lu [%lu/%lu] with gpu (%s) in ctx %u [full-ppgtt? %s], err=%d\n",
+				       ndwords, dw, max_dwords(obj),
+				       engine->name, ctx->hw_id,
+				       yesno(!!ctx->ppgtt), err);
+				goto out_unlock;
+			}
+
+			if (++dw == max_dwords(obj)) {
+				obj = NULL;
+				dw = 0;
+			}
+
+			ndwords++;
+			ncontexts++;
+		}
+		pr_info("Submitted %lu contexts to %s, filling %lu dwords\n",
+			ncontexts, engine->name, ndwords);
+
+		ncontexts = dw = 0;
+		list_for_each_entry(obj, &objects, st_link) {
+			unsigned int rem =
+				min_t(unsigned int, ndwords - dw, max_dwords(obj));
+
+			err = cpu_check(obj, ncontexts++, rem);
+			if (err)
+				break;
+
+			dw += rem;
+		}
 
 out_unlock:
-	if (igt_live_test_end(&t))
-		err = -EIO;
-	mutex_unlock(&i915->drm.struct_mutex);
+		if (igt_live_test_end(&t))
+			err = -EIO;
+		mutex_unlock(&i915->drm.struct_mutex);
 
-	mock_file_free(i915, file);
-	return err;
+		mock_file_free(i915, file);
+		if (err)
+			return err;
+	}
+
+	return 0;
 }
 
 static struct i915_vma *rpcs_query_batch(struct i915_vma *vma)
@@ -1064,7 +1194,7 @@ static int igt_ctx_readonly(void *arg)
 	struct drm_i915_gem_object *obj = NULL;
 	struct i915_gem_context *ctx;
 	struct i915_hw_ppgtt *ppgtt;
-	unsigned long ndwords, dw;
+	unsigned long idx, ndwords, dw;
 	struct igt_live_test t;
 	struct drm_file *file;
 	I915_RND_STATE(prng);
@@ -1145,6 +1275,7 @@ static int igt_ctx_readonly(void *arg)
 		ndwords, RUNTIME_INFO(i915)->num_rings);
 
 	dw = 0;
+	idx = 0;
 	list_for_each_entry(obj, &objects, st_link) {
 		unsigned int rem =
 			min_t(unsigned int, ndwords - dw, max_dwords(obj));
@@ -1154,7 +1285,7 @@ static int igt_ctx_readonly(void *arg)
 		if (i915_gem_object_is_readonly(obj))
 			num_writes = 0;
 
-		err = cpu_check(obj, num_writes);
+		err = cpu_check(obj, idx++, num_writes);
 		if (err)
 			break;
 
@@ -1635,6 +1766,7 @@ int i915_gem_context_live_selftests(struct drm_i915_private *dev_priv)
 		SUBTEST(igt_ctx_exec),
 		SUBTEST(igt_ctx_readonly),
 		SUBTEST(igt_ctx_sseu),
+		SUBTEST(igt_shared_ctx_exec),
 		SUBTEST(igt_vm_isolation),
 	};
 
diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
index 3850ef4a5ec8..08a8f3d20854 100644
--- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
@@ -1020,7 +1020,6 @@ static int exercise_ppgtt(struct drm_i915_private *dev_priv,
 
 	err = func(dev_priv, &ppgtt->vm, 0, ppgtt->vm.total, end_time);
 
-	i915_ppgtt_close(&ppgtt->vm);
 	i915_ppgtt_put(ppgtt);
 out_unlock:
 	mutex_unlock(&dev_priv->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
index b646cdcdd602..5ab3f226c6ee 100644
--- a/drivers/gpu/drm/i915/selftests/mock_context.c
+++ b/drivers/gpu/drm/i915/selftests/mock_context.c
@@ -53,13 +53,17 @@ mock_context(struct drm_i915_private *i915,
 		goto err_handles;
 
 	if (name) {
+		struct i915_hw_ppgtt *ppgtt;
+
 		ctx->name = kstrdup(name, GFP_KERNEL);
 		if (!ctx->name)
 			goto err_put;
 
-		ctx->ppgtt = mock_ppgtt(i915, name);
-		if (!ctx->ppgtt)
+		ppgtt = mock_ppgtt(i915, name);
+		if (!ppgtt)
 			goto err_put;
+
+		__set_ppgtt(ctx, ppgtt);
 	}
 
 	return ctx;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 5de554708393..caf4de0fdfb6 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -339,6 +339,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_PERF_ADD_CONFIG	0x37
 #define DRM_I915_PERF_REMOVE_CONFIG	0x38
 #define DRM_I915_QUERY			0x39
+#define DRM_I915_GEM_VM_CREATE		0x3a
+#define DRM_I915_GEM_VM_DESTROY		0x3b
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -397,6 +399,8 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_PERF_ADD_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_ADD_CONFIG, struct drm_i915_perf_oa_config)
 #define DRM_IOCTL_I915_PERF_REMOVE_CONFIG	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_PERF_REMOVE_CONFIG, __u64)
 #define DRM_IOCTL_I915_QUERY			DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_QUERY, struct drm_i915_query)
+#define DRM_IOCTL_I915_GEM_VM_CREATE	DRM_IOWR (DRM_COMMAND_BASE + DRM_I915_GEM_VM_CREATE, struct drm_i915_gem_vm_control)
+#define DRM_IOCTL_I915_GEM_VM_DESTROY	DRM_IOW (DRM_COMMAND_BASE + DRM_I915_GEM_VM_DESTROY, struct drm_i915_gem_vm_control)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -1442,6 +1446,12 @@ struct drm_i915_gem_context_destroy {
 	__u32 pad;
 };
 
+struct drm_i915_gem_vm_control {
+	__u64 extensions;
+	__u32 flags;
+	__u32 id;
+};
+
 struct drm_i915_reg_read {
 	/*
 	 * Register offset.
@@ -1511,6 +1521,7 @@ struct drm_i915_gem_context_param {
 	 * drm_i915_gem_context_param_sseu.
 	 */
 #define I915_CONTEXT_PARAM_SSEU		0x7
+#define I915_CONTEXT_PARAM_VM		0x8
 	__u64 value;
 };
 
-- 
2.20.1
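
For reference, a userspace sketch of the new interface (illustrative
only; error handling and the usual drmIoctl() EINTR retry are omitted),
creating one ppGTT and pointing two existing contexts at it:

#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static void share_vm(int fd, __u32 ctx_a, __u32 ctx_b)
{
        struct drm_i915_gem_vm_control vm = { 0 };
        struct drm_i915_gem_context_param arg = {
                .param = I915_CONTEXT_PARAM_VM,
        };

        /* Allocate a new ppGTT and receive its per-fd handle in vm.id */
        ioctl(fd, DRM_IOCTL_I915_GEM_VM_CREATE, &vm);
        arg.value = vm.id;

        /* Attach the same address space to both contexts */
        arg.ctx_id = ctx_a;
        ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);
        arg.ctx_id = ctx_b;
        ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &arg);

        /* Drop our handle; each context keeps its own reference */
        ioctl(fd, DRM_IOCTL_I915_GEM_VM_DESTROY, &vm);
}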

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 66+ messages in thread

* ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context (rev2)
  2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
                   ` (38 preceding siblings ...)
  2019-01-18 14:17 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context Patchwork
@ 2019-01-24 16:28 ` Patchwork
  39 siblings, 0 replies; 66+ messages in thread
From: Patchwork @ 2019-01-24 16:28 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/38] drm/i915/execlists: Store the highest priority context (rev2)
URL   : https://patchwork.freedesktop.org/series/55411/
State : failure

== Summary ==

Applying: drm/i915/execlists: Store the highest priority context
Applying: drm/i915: Make all GPU resets atomic
Applying: drm/i915/guc: Disable global reset
Applying: drm/i915: Remove GPU reset dependence on struct_mutex
Applying: drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest
Applying: drm/i915: Issue engine resets onto idle engines
Applying: drm/i915: Stop tracking MRU activity on VMA
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/i915_gem.c
M	drivers/gpu/drm/i915/selftests/i915_gem_evict.c
M	drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/selftests/i915_gem_gtt.c
Auto-merging drivers/gpu/drm/i915/selftests/i915_gem_evict.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/selftests/i915_gem_evict.c
Auto-merging drivers/gpu/drm/i915/i915_gem.c
error: Failed to merge in the changes.
hint: Use 'git am --show-current-patch' to see the failed patch
Patch failed at 0007 drm/i915: Stop tracking MRU activity on VMA
When you have resolved this problem, run "git am --continue".
If you prefer to skip this patch, run "git am --skip" instead.
To restore the original branch and stop patching, run "git am --abort".

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines
  2019-01-18 14:01 ` [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines Chris Wilson
@ 2019-01-24 17:35   ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-24 17:35 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 18/01/2019 14:01, Chris Wilson wrote:
> Previously, our view has always been to run the engines independently
> within a context. (Multiple engines happened before we had contexts and
> timelines, so they always operated independently and that behaviour
> persisted into contexts.) However, at the user level the context often
> represents a single timeline (e.g. GL contexts) and userspace must
> ensure that the individual engines are serialised to present that
> ordering to the client (or forget about this detail entirely and hope no
> one notices - a fair ploy if the client can only directly control one
> engine themselves ;)
> 
> In the next patch, we will want to construct a set of engines that
> operate as one, that have a single timeline interwoven between them, to
> present a single virtual engine to the user. (They submit to the virtual
> engine, then we decide which engine to execute on based.)
> 
> To that end, we want to be able to create contexts which have a single
> timeline (fence context) shared between all engines, rather than multiple
> timelines.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/i915_gem_context.c       | 33 +++++++++++++---
>   drivers/gpu/drm/i915/i915_gem_context.h       |  3 ++
>   drivers/gpu/drm/i915/i915_request.c           | 10 ++++-
>   drivers/gpu/drm/i915/i915_request.h           |  5 ++-
>   drivers/gpu/drm/i915/i915_sw_fence.c          | 39 ++++++++++++++++---
>   drivers/gpu/drm/i915/i915_sw_fence.h          | 13 ++++++-
>   drivers/gpu/drm/i915/intel_lrc.c              |  5 ++-
>   .../gpu/drm/i915/selftests/i915_gem_context.c | 17 ++++----
>   drivers/gpu/drm/i915/selftests/mock_context.c |  2 +-
>   include/uapi/drm/i915_drm.h                   |  1 +
>   10 files changed, 103 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index ec5e3e1c6402..e28be242399d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -225,6 +225,9 @@ static void i915_gem_context_free(struct i915_gem_context *ctx)
>   			ce->ops->destroy(ce);
>   	}
>   
> +	if (ctx->timeline)
> +		i915_timeline_put(ctx->timeline);
> +
>   	kfree(ctx->name);
>   	put_pid(ctx->pid);
>   
> @@ -425,12 +428,17 @@ static void __set_ppgtt(struct i915_gem_context *ctx,
>   
>   static struct i915_gem_context *
>   i915_gem_create_context(struct drm_i915_private *dev_priv,
> -			struct drm_i915_file_private *file_priv)
> +			struct drm_i915_file_private *file_priv,
> +			unsigned int flags)
>   {
>   	struct i915_gem_context *ctx;
>   
>   	lockdep_assert_held(&dev_priv->drm.struct_mutex);
>   
> +	if (flags & I915_GEM_CONTEXT_SINGLE_TIMELINE &&
> +	    !HAS_EXECLISTS(dev_priv))
> +		return ERR_PTR(-EINVAL);
> +
>   	/* Reap the most stale context */
>   	contexts_free_first(dev_priv);
>   
> @@ -453,6 +461,18 @@ i915_gem_create_context(struct drm_i915_private *dev_priv,
>   		i915_ppgtt_put(ppgtt);
>   	}
>   
> +	if (flags & I915_GEM_CONTEXT_SINGLE_TIMELINE) {
> +		struct i915_timeline *timeline;
> +
> +		timeline = i915_timeline_create(dev_priv, ctx->name, NULL);
> +		if (IS_ERR(timeline)) {
> +			__destroy_hw_context(ctx, file_priv);
> +			return ERR_CAST(timeline);
> +		}
> +
> +		ctx->timeline = timeline;
> +	}
> +
>   	trace_i915_context_create(ctx);
>   
>   	return ctx;
> @@ -481,7 +501,7 @@ i915_gem_context_create_gvt(struct drm_device *dev)
>   	if (ret)
>   		return ERR_PTR(ret);
>   
> -	ctx = i915_gem_create_context(to_i915(dev), NULL);
> +	ctx = i915_gem_create_context(to_i915(dev), NULL, 0);
>   	if (IS_ERR(ctx))
>   		goto out;
>   
> @@ -517,7 +537,7 @@ i915_gem_context_create_kernel(struct drm_i915_private *i915, int prio)
>   	struct i915_gem_context *ctx;
>   	int err;
>   
> -	ctx = i915_gem_create_context(i915, NULL);
> +	ctx = i915_gem_create_context(i915, NULL, 0);
>   	if (IS_ERR(ctx))
>   		return ctx;
>   
> @@ -638,7 +658,7 @@ int i915_gem_context_open(struct drm_i915_private *i915,
>   	idr_init_base(&file_priv->vm_idr, 1);
>   
>   	mutex_lock(&i915->drm.struct_mutex);
> -	ctx = i915_gem_create_context(i915, file_priv);
> +	ctx = i915_gem_create_context(i915, file_priv, 0);
>   	mutex_unlock(&i915->drm.struct_mutex);
>   	if (IS_ERR(ctx)) {
>   		idr_destroy(&file_priv->context_idr);
> @@ -992,7 +1012,8 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>   	if (!DRIVER_CAPS(dev_priv)->has_logical_contexts)
>   		return -ENODEV;
>   
> -	if (args->flags)
> +	if (args->flags &
> +	    ~(I915_GEM_CONTEXT_SINGLE_TIMELINE))
>   		return -EINVAL;
>   
>   	if (client_is_banned(file_priv)) {
> @@ -1007,7 +1028,7 @@ int i915_gem_context_create_ioctl(struct drm_device *dev, void *data,
>   	if (ret)
>   		return ret;
>   
> -	ctx = i915_gem_create_context(dev_priv, file_priv);
> +	ctx = i915_gem_create_context(dev_priv, file_priv, args->flags);
>   	mutex_unlock(&dev->struct_mutex);
>   	if (IS_ERR(ctx))
>   		return PTR_ERR(ctx);
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.h b/drivers/gpu/drm/i915/i915_gem_context.h
> index 9786f86b659d..b3a840747330 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.h
> +++ b/drivers/gpu/drm/i915/i915_gem_context.h
> @@ -41,6 +41,7 @@ struct drm_i915_private;
>   struct drm_i915_file_private;
>   struct i915_hw_ppgtt;
>   struct i915_request;
> +struct i915_timeline;
>   struct i915_vma;
>   struct intel_ring;
>   
> @@ -66,6 +67,8 @@ struct i915_gem_context {
>   	/** file_priv: owning file descriptor */
>   	struct drm_i915_file_private *file_priv;
>   
> +	struct i915_timeline *timeline;
> +
>   	/**
>   	 * @ppgtt: unique address space (GTT)
>   	 *
> diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
> index 7bccf578cd65..ca432d3d8211 100644
> --- a/drivers/gpu/drm/i915/i915_request.c
> +++ b/drivers/gpu/drm/i915/i915_request.c
> @@ -860,8 +860,14 @@ void i915_request_add(struct i915_request *request)
>   	prev = i915_gem_active_raw(&timeline->last_request,
>   				   &request->i915->drm.struct_mutex);
>   	if (prev && !i915_request_completed(prev)) {
> -		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
> -					     &request->submitq);
> +		if (prev->engine == engine)
> +			i915_sw_fence_await_sw_fence(&request->submit,
> +						     &prev->submit,
> +						     &request->submitq);
> +		else
> +			__i915_sw_fence_await_dma_fence(&request->submit,
> +							&prev->fence,
> +							&request->dmaq);

Drop a comment here explaining this trick, at least I always forget the 
many different await flavours.

The subtlety of why we need a special flavour of await helper here, one 
which uses the builtin callback storage, vs the existing ones which 
allocate it, totally escapes me at the moment.

It's probably a good idea to put a paragraph in the commit message 
explaining what new sw fence facility needs to be added to implement 
this and why.
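
To check my own understanding (just a sketch, so please correct me if I
have the semantics wrong), something along these lines above the branch
would do:

        /*
         * Requests on the same engine are implicitly ordered by the ring,
         * so chaining this request's submit fence to the previous submit
         * fence is enough. If the previous request ran on a different
         * engine there is no such ordering, so we wait on its dma-fence
         * (i.e. completion) before allowing this request to be submitted.
         * Both flavours use callback storage embedded in the request
         * (submitq/dmaq) as we cannot fail or allocate at this point.
         */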

>   		if (engine->schedule)
>   			__i915_sched_node_add_dependency(&request->sched,
>   							 &prev->sched,
> diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h
> index 679b4663f774..f715384ff485 100644
> --- a/drivers/gpu/drm/i915/i915_request.h
> +++ b/drivers/gpu/drm/i915/i915_request.h
> @@ -108,7 +108,10 @@ struct i915_request {
>   	 * It is used by the driver to then queue the request for execution.
>   	 */
>   	struct i915_sw_fence submit;
> -	wait_queue_entry_t submitq;
> +	union {
> +		wait_queue_entry_t submitq;
> +		struct i915_sw_dma_fence_cb dmaq;
> +	};

The union deserves a comment as well I think - which member is for what, 
and why only one of them can be in use at a time.
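
Something like this maybe, based on my reading of i915_request_add
(sketch only):

        /*
         * A request waits on its timeline predecessor exactly once:
         * submitq is the wait-queue entry used when the predecessor is on
         * the same engine (awaiting its submit fence), dmaq the dma-fence
         * callback storage used when it is on a different engine. Only
         * one of the two is ever armed, so they can share storage.
         */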

>   
>   	/*
>   	 * A list of everyone we wait upon, and everyone who waits upon us.
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 7c58b049ecb5..7bb64437dbbe 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -359,11 +359,6 @@ int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   	return __i915_sw_fence_await_sw_fence(fence, signaler, NULL, gfp);
>   }
>   
> -struct i915_sw_dma_fence_cb {
> -	struct dma_fence_cb base;
> -	struct i915_sw_fence *fence;
> -};
> -
>   struct i915_sw_dma_fence_cb_timer {
>   	struct i915_sw_dma_fence_cb base;
>   	struct dma_fence *dma;
> @@ -480,6 +475,40 @@ int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   	return ret;
>   }
>   
> +static void __dma_i915_sw_fence_wake(struct dma_fence *dma,
> +				     struct dma_fence_cb *data)
> +{
> +	struct i915_sw_dma_fence_cb *cb = container_of(data, typeof(*cb), base);
> +
> +	i915_sw_fence_complete(cb->fence);
> +}
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb)
> +{
> +	int ret;
> +
> +	debug_fence_assert(fence);
> +
> +	if (dma_fence_is_signaled(dma))
> +		return 0;
> +
> +	cb->fence = fence;
> +	i915_sw_fence_await(fence);
> +
> +	ret = dma_fence_add_callback(dma, &cb->base, __dma_i915_sw_fence_wake);
> +	if (ret == 0) {
> +		ret = 1;
> +	} else {
> +		i915_sw_fence_complete(fence);
> +		if (ret == -ENOENT) /* fence already signaled */
> +			ret = 0;
> +	}
> +
> +	return ret;
> +}
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 0e055ea0179f..b420ceadb813 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -9,14 +9,13 @@
>   #ifndef _I915_SW_FENCE_H_
>   #define _I915_SW_FENCE_H_
>   
> +#include <linux/dma-fence.h>
>   #include <linux/gfp.h>
>   #include <linux/kref.h>
>   #include <linux/notifier.h> /* for NOTIFY_DONE */
>   #include <linux/wait.h>
>   
>   struct completion;
> -struct dma_fence;
> -struct dma_fence_ops;
>   struct reservation_object;
>   
>   struct i915_sw_fence {
> @@ -68,10 +67,20 @@ int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
>   int i915_sw_fence_await_sw_fence_gfp(struct i915_sw_fence *fence,
>   				     struct i915_sw_fence *after,
>   				     gfp_t gfp);
> +
> +struct i915_sw_dma_fence_cb {
> +	struct dma_fence_cb base;
> +	struct i915_sw_fence *fence;
> +};
> +
> +int __i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
> +				    struct dma_fence *dma,
> +				    struct i915_sw_dma_fence_cb *cb);
>   int i915_sw_fence_await_dma_fence(struct i915_sw_fence *fence,
>   				  struct dma_fence *dma,
>   				  unsigned long timeout,
>   				  gfp_t gfp);
> +
>   int i915_sw_fence_await_reservation(struct i915_sw_fence *fence,
>   				    struct reservation_object *resv,
>   				    const struct dma_fence_ops *exclude,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 323341e9bf2d..10c42820bb46 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -2644,7 +2644,10 @@ static int execlists_context_deferred_alloc(struct i915_gem_context *ctx,
>   		goto error_deref_obj;
>   	}
>   
> -	timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
> +	if (ctx->timeline)
> +		timeline = i915_timeline_get(ctx->timeline);
> +	else
> +		timeline = i915_timeline_create(ctx->i915, ctx->name, NULL);
>   	if (IS_ERR(timeline)) {
>   		ret = PTR_ERR(timeline);
>   		goto error_deref_obj;
> diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_context.c b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> index 2864cfb82325..3e68c2888b9c 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/selftests/i915_gem_context.c
> @@ -143,7 +143,7 @@ static int live_nop_switch(void *arg)
>   	}
>   
>   	for (n = 0; n < nctx; n++) {
> -		ctx[n] = i915_gem_create_context(i915, file->driver_priv);
> +		ctx[n] = i915_gem_create_context(i915, file->driver_priv, 0);
>   		if (IS_ERR(ctx[n])) {
>   			err = PTR_ERR(ctx[n]);
>   			goto out_unlock;
> @@ -601,7 +601,8 @@ static int igt_ctx_exec(void *arg)
>   			struct i915_gem_context *ctx;
>   			intel_wakeref_t wakeref;
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_unlock;
> @@ -698,7 +699,8 @@ static int igt_shared_ctx_exec(void *arg)
>   		if (err)
>   			goto out_unlock;
>   
> -		parent = i915_gem_create_context(i915, file->driver_priv);
> +		parent = i915_gem_create_context(i915,
> +						 file->driver_priv, 0);
>   		if (IS_ERR(parent)) {
>   			err = PTR_ERR(parent);
>   			if (err == -ENODEV) /* no logical ctx support */
> @@ -720,7 +722,8 @@ static int igt_shared_ctx_exec(void *arg)
>   			if (ctx)
>   				__destroy_hw_context(ctx, file->driver_priv);
>   
> -			ctx = i915_gem_create_context(i915, file->driver_priv);
> +			ctx = i915_gem_create_context(i915,
> +						      file->driver_priv, 0);
>   			if (IS_ERR(ctx)) {
>   				err = PTR_ERR(ctx);
>   				goto out_unlock;
> @@ -813,7 +816,7 @@ static int igt_ctx_readonly(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx = i915_gem_create_context(i915, file->driver_priv);
> +	ctx = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx)) {
>   		err = PTR_ERR(ctx);
>   		goto out_unlock;
> @@ -1139,13 +1142,13 @@ static int igt_vm_isolation(void *arg)
>   	if (err)
>   		goto out_unlock;
>   
> -	ctx_a = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_a = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_a)) {
>   		err = PTR_ERR(ctx_a);
>   		goto out_unlock;
>   	}
>   
> -	ctx_b = i915_gem_create_context(i915, file->driver_priv);
> +	ctx_b = i915_gem_create_context(i915, file->driver_priv, 0);
>   	if (IS_ERR(ctx_b)) {
>   		err = PTR_ERR(ctx_b);
>   		goto out_unlock;
> diff --git a/drivers/gpu/drm/i915/selftests/mock_context.c b/drivers/gpu/drm/i915/selftests/mock_context.c
> index b9fd2a4b95e9..f13f9c726034 100644
> --- a/drivers/gpu/drm/i915/selftests/mock_context.c
> +++ b/drivers/gpu/drm/i915/selftests/mock_context.c
> @@ -99,7 +99,7 @@ live_context(struct drm_i915_private *i915, struct drm_file *file)
>   {
>   	lockdep_assert_held(&i915->drm.struct_mutex);
>   
> -	return i915_gem_create_context(i915, file->driver_priv);
> +	return i915_gem_create_context(i915, file->driver_priv, 0);
>   }
>   
>   struct i915_gem_context *
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 704e9d2fe2d6..72749dc9801e 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -1444,6 +1444,7 @@ struct drm_i915_gem_context_create {
>   struct drm_i915_gem_context_create_ext {
>   	__u32 ctx_id; /* output: id of new context*/
>   	__u32 flags;
> +#define I915_GEM_CONTEXT_SINGLE_TIMELINE	0x1

And some kernel doc please.
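
And as a usage sketch for that doc (illustrative only; I'm assuming the
extended create ioctl from the earlier patch in the series and calling
it DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT here):

        struct drm_i915_gem_context_create_ext create = {
                .flags = I915_GEM_CONTEXT_SINGLE_TIMELINE,
        };

        /* hypothetical name for the extended create ioctl */
        ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_CREATE_EXT, &create);

        /*
         * create.ctx_id now names a context whose engines share a single
         * timeline (fence context), so execbufs are serialised across
         * engines without explicit fences from userspace.
         */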

>   	__u64 extensions;
>   };
>   
> 

Looks fine, no complaints, apart from needing help to remind me how some 
things work.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-22 14:19     ` Chris Wilson
@ 2019-01-25 10:46       ` Tvrtko Ursulin
  2019-01-25 13:38         ` Chris Wilson
  0 siblings, 1 reply; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-25 10:46 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 22/01/2019 14:19, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
>>
>> On 18/01/2019 14:00, Chris Wilson wrote:
>>> Our goal is to remove struct_mutex and replace it with fine grained
>>> locking. One of the thorny issues is our eviction logic for reclaiming
>>> space for an execbuffer (or GTT mmaping, among a few other examples).
>>> While eviction itself is easy to move under a per-VM mutex, performing
>>> the activity tracking is less agreeable. One solution is not to do any
>>> MRU tracking and do a simple coarse evaluation during eviction of
>>> active/inactive, with a loose temporal ordering of last
>>> insertion/evaluation. That keeps all the locking constrained to when we
>>> are manipulating the VM itself, neatly avoiding the tricky handling of
>>> possible recursive locking during execbuf and elsewhere.
>>>
>>> Note that discarding the MRU is unlikely to impact upon our efficiency
>>> to reclaim VM space (where we think a LRU model is best) as our
>>> current strategy is to use random idle replacement first before doing
>>> a search, and over time the use of softpinned 48b per-ppGTT is growing
>>> (thereby eliminating any need to perform any eviction searches, in
>>> theory at least).
>>
>> There is still no mention of list consolidation.
> 
> "Note that discarding the MRU (currently implemented as a pair of lists,
> to avoid scanning the active list for a NONBLOCKING search)"
> 
> Is that enough to make it clear?

How about:

"Note that discarding the MRU, which is implemented both by not doing 
list operations while tracking activity, and by replacing 
active/inactive list with a single bound list, is unlikely..."

?

Plus I really want a changelog and that stale comment which talks about 
active/inactive *lists* updated.

Regards,

Tvrtko



_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-25 10:46       ` Tvrtko Ursulin
@ 2019-01-25 13:38         ` Chris Wilson
  2019-01-25 13:46           ` Chris Wilson
  0 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-25 13:38 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2019-01-25 10:46:19)
> 
> On 22/01/2019 14:19, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
> >>
> >> On 18/01/2019 14:00, Chris Wilson wrote:
> >>> Our goal is to remove struct_mutex and replace it with fine grained
> >>> locking. One of the thorny issues is our eviction logic for reclaiming
> >>> space for an execbuffer (or GTT mmaping, among a few other examples).
> >>> While eviction itself is easy to move under a per-VM mutex, performing
> >>> the activity tracking is less agreeable. One solution is not to do any
> >>> MRU tracking and do a simple coarse evaluation during eviction of
> >>> active/inactive, with a loose temporal ordering of last
> >>> insertion/evaluation. That keeps all the locking constrained to when we
> >>> are manipulating the VM itself, neatly avoiding the tricky handling of
> >>> possible recursive locking during execbuf and elsewhere.
> >>>
> >>> Note that discarding the MRU is unlikely to impact upon our efficiency
> >>> to reclaim VM space (where we think a LRU model is best) as our
> >>> current strategy is to use random idle replacement first before doing
> >>> a search, and over time the use of softpinned 48b per-ppGTT is growing
> >>> (thereby eliminating any need to perform any eviction searches, in
> >>> theory at least).
> >>
> >> There is still no mention of list consolidation.
> > 
> > "Note that discarding the MRU (currently implemented as a pair of lists,
> > to avoid scanning the active list for a NONBLOCKING search)"
> > 
> > Is that enough to make it clear?
> 
> How about:
> 
> "Note that discarding the MRU, which is implemented both by not doing 
> list operations while tracking activity, and by replacing 
> active/inactive list with a single bound list, is unlikely..."
> 
> ?
> 
> Plus I really want a changelog and that stale comment which talks about 
> active/inactive *lists* updated.

But there's active/inactive lists within the list...
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-25 13:38         ` Chris Wilson
@ 2019-01-25 13:46           ` Chris Wilson
  2019-01-25 14:08             ` Tvrtko Ursulin
  0 siblings, 1 reply; 66+ messages in thread
From: Chris Wilson @ 2019-01-25 13:46 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2019-01-25 13:38:09)
> Quoting Tvrtko Ursulin (2019-01-25 10:46:19)
> > 
> > On 22/01/2019 14:19, Chris Wilson wrote:
> > > Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
> > >>
> > >> On 18/01/2019 14:00, Chris Wilson wrote:
> > >>> Our goal is to remove struct_mutex and replace it with fine grained
> > >>> locking. One of the thorny issues is our eviction logic for reclaiming
> > >>> space for an execbuffer (or GTT mmaping, among a few other examples).
> > >>> While eviction itself is easy to move under a per-VM mutex, performing
> > >>> the activity tracking is less agreeable. One solution is not to do any
> > >>> MRU tracking and do a simple coarse evaluation during eviction of
> > >>> active/inactive, with a loose temporal ordering of last
> > >>> insertion/evaluation. That keeps all the locking constrained to when we
> > >>> are manipulating the VM itself, neatly avoiding the tricky handling of
> > >>> possible recursive locking during execbuf and elsewhere.
> > >>>
> > >>> Note that discarding the MRU is unlikely to impact upon our efficiency
> > >>> to reclaim VM space (where we think a LRU model is best) as our
> > >>> current strategy is to use random idle replacement first before doing
> > >>> a search, and over time the use of softpinned 48b per-ppGTT is growing
> > >>> (thereby eliminating any need to perform any eviction searches, in
> > >>> theory at least).
> > >>
> > >> There is still no mention of list consolidation.
> > > 
> > > "Note that discarding the MRU (currently implemented as a pair of lists,
> > > to avoid scanning the active list for a NONBLOCKING search)"
> > > 
> > > Is that enough to make it clear?
> > 
> > How about:
> > 
> > "Note that discarding the MRU, which is implemented both by not doing 
> > list operations while tracking activity, and by replacing 
> > active/inactive list with a single bound list, is unlikely..."
> > 
> > ?
> > 
> > Plus I really want a changelog and that stale comment which talks about 
> > active/inactive *lists* updated.
> 
> But there's active/inactive lists within the list...

@@ -136,17 +136,15 @@ i915_gem_evict_something(struct i915_address_space *vm,
        trace_i915_gem_evict(vm, min_size, alignment, flags);

        /*
-        * The goal is to evict objects and amalgamate space in LRU order.
-        * The oldest idle objects reside on the inactive list, which is in
-        * retirement order. The next objects to retire are those in flight,
-        * on the active list, again in retirement order.
+        * The goal is to evict objects and amalgamate space in rough LRU order.
+        * Since both active and inactive objects reside on the same list,
+        * in a mix of creation and last scanned order, as we process the list
+        * we sort it into inactive/active, which keeps the active portion
+        * in a rough MRU order.
         *
         * The retirement sequence is thus:
         *   1. Inactive objects (already retired, random order)
         *   2. Active objects (will stall on unbinding, oldest scanned first)
-        *
-        * On each list, the oldest objects lie at the HEAD with the freshest
-        * object on the TAIL.
         */
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

* Re: [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA
  2019-01-25 13:46           ` Chris Wilson
@ 2019-01-25 14:08             ` Tvrtko Ursulin
  0 siblings, 0 replies; 66+ messages in thread
From: Tvrtko Ursulin @ 2019-01-25 14:08 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 25/01/2019 13:46, Chris Wilson wrote:
> Quoting Chris Wilson (2019-01-25 13:38:09)
>> Quoting Tvrtko Ursulin (2019-01-25 10:46:19)
>>>
>>> On 22/01/2019 14:19, Chris Wilson wrote:
>>>> Quoting Tvrtko Ursulin (2019-01-18 16:03:27)
>>>>>
>>>>> On 18/01/2019 14:00, Chris Wilson wrote:
>>>>>> Our goal is to remove struct_mutex and replace it with fine grained
>>>>>> locking. One of the thorny issues is our eviction logic for reclaiming
>>>>>> space for an execbuffer (or GTT mmaping, among a few other examples).
>>>>>> While eviction itself is easy to move under a per-VM mutex, performing
>>>>>> the activity tracking is less agreeable. One solution is not to do any
>>>>>> MRU tracking and do a simple coarse evaluation during eviction of
>>>>>> active/inactive, with a loose temporal ordering of last
>>>>>> insertion/evaluation. That keeps all the locking constrained to when we
>>>>>> are manipulating the VM itself, neatly avoiding the tricky handling of
>>>>>> possible recursive locking during execbuf and elsewhere.
>>>>>>
>>>>>> Note that discarding the MRU is unlikely to impact upon our efficiency
>>>>>> to reclaim VM space (where we think a LRU model is best) as our
>>>>>> current strategy is to use random idle replacement first before doing
>>>>>> a search, and over time the use of softpinned 48b per-ppGTT is growing
>>>>>> (thereby eliminating any need to perform any eviction searches, in
>>>>>> theory at least).
>>>>>
>>>>> There is still no mention of list consolidation.
>>>>
>>>> "Note that discarding the MRU (currently implemented as a pair of lists,
>>>> to avoid scanning the active list for a NONBLOCKING search)"
>>>>
>>>> Is that enough to make it clear?
>>>
>>> How about:
>>>
>>> "Note that discarding the MRU, which is implemented both by not doing
>>> list operations while tracking activity, and by replacing
>>> active/inactive list with a single bound list, is unlikely..."
>>>
>>> ?
>>>
>>> Plus I really want a changelog and that stale comment which talks about
>>> active/inactive *lists* updated.
>>
>> But there's active/inactive lists within the list...
> 
> @@ -136,17 +136,15 @@ i915_gem_evict_something(struct i915_address_space *vm,
>          trace_i915_gem_evict(vm, min_size, alignment, flags);
> 
>          /*
> -        * The goal is to evict objects and amalgamate space in LRU order.
> -        * The oldest idle objects reside on the inactive list, which is in
> -        * retirement order. The next objects to retire are those in flight,
> -        * on the active list, again in retirement order.
> +        * The goal is to evict objects and amalgamate space in rough LRU order.
> +        * Since both active and inactive objects reside on the same list,
> +        * in a mix of creation and last scanned order, as we process the list
> +        * we sort it into inactive/active, which keeps the active portion
> +        * in a rough MRU order.
>           *
>           * The retirement sequence is thus:
>           *   1. Inactive objects (already retired, random order)
>           *   2. Active objects (will stall on unbinding, oldest scanned first)
> -        *
> -        * On each list, the oldest objects lie at the HEAD with the freshest
> -        * object on the TAIL.
>           */

Yep, I'm happy with that.

Regards,

Tvrtko


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 66+ messages in thread

end of thread, other threads:[~2019-01-25 14:08 UTC | newest]

Thread overview: 66+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-18 14:00 Keeping Tvrtko busy Chris Wilson
2019-01-18 14:00 ` [PATCH 01/38] drm/i915/execlists: Store the highest priority context Chris Wilson
2019-01-18 14:00 ` [PATCH 02/38] drm/i915: Make all GPU resets atomic Chris Wilson
2019-01-18 14:22   ` Mika Kuoppala
2019-01-18 14:00 ` [PATCH 03/38] drm/i915/guc: Disable global reset Chris Wilson
2019-01-18 14:00 ` [PATCH 04/38] drm/i915: Remove GPU reset dependence on struct_mutex Chris Wilson
2019-01-18 14:00 ` [PATCH 05/38] drm/i915/selftests: Trim struct_mutex duration for set-wedged selftest Chris Wilson
2019-01-18 14:29   ` Mika Kuoppala
2019-01-18 14:00 ` [PATCH 06/38] drm/i915: Issue engine resets onto idle engines Chris Wilson
2019-01-18 14:00 ` [PATCH 07/38] drm/i915: Stop tracking MRU activity on VMA Chris Wilson
2019-01-18 16:03   ` Tvrtko Ursulin
2019-01-18 16:06     ` Chris Wilson
2019-01-22 14:19     ` Chris Wilson
2019-01-25 10:46       ` Tvrtko Ursulin
2019-01-25 13:38         ` Chris Wilson
2019-01-25 13:46           ` Chris Wilson
2019-01-25 14:08             ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 08/38] drm/i915: Pull VM lists under the VM mutex Chris Wilson
2019-01-18 16:04   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 09/38] drm/i915: Move vma lookup to its own lock Chris Wilson
2019-01-18 16:14   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 10/38] drm/i915/selftests: Allocate mock ring/timeline per context Chris Wilson
2019-01-18 16:26   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 11/38] drm/i915: Always allocate an object/vma for the HWSP Chris Wilson
2019-01-18 14:00 ` [PATCH 12/38] drm/i915: Move list of timelines under its own lock Chris Wilson
2019-01-18 16:28   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 13/38] drm/i915: Introduce concept of per-timeline (context) HWSP Chris Wilson
2019-01-18 14:00 ` [PATCH 14/38] drm/i915: Enlarge vma->pin_count Chris Wilson
2019-01-18 14:00 ` [PATCH 15/38] drm/i915: Allocate a status page for each timeline Chris Wilson
2019-01-21 11:18   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 16/38] drm/i915: Share per-timeline HWSP using a slab suballocator Chris Wilson
2019-01-18 14:00 ` [PATCH 17/38] drm/i915: Keep all partially allocated HWSP on a freelist Chris Wilson
2019-01-18 14:00 ` [PATCH 18/38] drm/i915: Track the context's seqno in its own timeline HWSP Chris Wilson
2019-01-18 14:00 ` [PATCH 19/38] drm/i915: Identify active requests Chris Wilson
2019-01-18 14:00 ` [PATCH 20/38] drm/i915: Remove the intel_engine_notify tracepoint Chris Wilson
2019-01-18 14:00 ` [PATCH 21/38] drm/i915: Replace global breadcrumbs with per-context interrupt tracking Chris Wilson
2019-01-18 14:00 ` [PATCH 22/38] drm/i915: Drop fake breadcrumb irq Chris Wilson
2019-01-18 14:00 ` [PATCH 23/38] drm/i915: Replace global_seqno with a hangcheck heartbeat seqno Chris Wilson
2019-01-18 14:00 ` [PATCH 24/38] drm/i915: Avoid presumption of execution ordering for kernel context switching Chris Wilson
2019-01-18 14:00 ` [PATCH 25/38] drm/i915/pmu: Always sample an active ringbuffer Chris Wilson
2019-01-22  9:20   ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 26/38] drm/i915: Remove the global per-engine execution timeline Chris Wilson
2019-01-18 14:00 ` [PATCH 27/38] drm/i915: Introduce the i915_user_extension_method Chris Wilson
2019-01-22  9:31   ` Tvrtko Ursulin
2019-01-22 10:47     ` Chris Wilson
2019-01-22 11:05       ` Tvrtko Ursulin
2019-01-18 14:00 ` [PATCH 28/38] drm/i915: Create/destroy VM (ppGTT) for use with contexts Chris Wilson
2019-01-23 11:30   ` Tvrtko Ursulin
2019-01-23 11:51     ` Chris Wilson
2019-01-23 12:03       ` Tvrtko Ursulin
2019-01-24 15:58     ` [PATCH v3] " Chris Wilson
2019-01-18 14:01 ` [PATCH 29/38] drm/i915: Expose user control over the ppGTT associated with a context Chris Wilson
2019-01-23 12:00   ` Tvrtko Ursulin
2019-01-23 12:15     ` Chris Wilson
2019-01-18 14:01 ` [PATCH 30/38] drm/i915: Extend CONTEXT_CREATE to set parameters upon construction Chris Wilson
2019-01-18 14:01 ` [PATCH 31/38] drm/i915: Allow contexts to share a single timeline across all engines Chris Wilson
2019-01-24 17:35   ` Tvrtko Ursulin
2019-01-18 14:01 ` [PATCH 32/38] drm/i915: Fix I915_EXEC_RING_MASK Chris Wilson
2019-01-18 14:01 ` [PATCH 33/38] drm/i915: Remove last traces of exec-id (GEM_BUSY) Chris Wilson
2019-01-18 14:01 ` [PATCH 34/38] drm/i915: Re-arrange execbuf so context is known before engine Chris Wilson
2019-01-18 14:01 ` [PATCH 35/38] drm/i915: Allow a context to define its set of engines Chris Wilson
2019-01-18 14:01 ` [PATCH 36/38] drm/i915/execlists: Refactor out can_merge_rq() Chris Wilson
2019-01-18 14:01 ` [PATCH 37/38] drm/i915: Store the BIT(engine->id) as the engine's mask Chris Wilson
2019-01-18 14:01 ` [PATCH 38/38] drm/i915: Load balancing across a virtual engine Chris Wilson
2019-01-18 14:17 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context Patchwork
2019-01-24 16:28 ` ✗ Fi.CI.BAT: failure for series starting with [01/38] drm/i915/execlists: Store the highest priority context (rev2) Patchwork
