* Trivial priority scheduler
@ 2016-11-02 17:50 Chris Wilson
  2016-11-02 17:50 ` [PATCH 01/12] drm/i915: Split request submit/execute phase into two Chris Wilson
                   ` (12 more replies)
  0 siblings, 13 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Now that the tree has caught up, we can look at the next steps towards
a basic scheduler. No timeslicing, no preemption, no fairness: we simply
choose the highest priority request to execute next. (An incoming high
priority request will bump the priority of its prerequisites so that it
will execute at the earliest opportunity.)
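
As a toy user-space model of that policy (invented names throughout,
not driver code), the core loop amounts to picking the highest priority
runnable request, with a priority bump propagating down the dependency
chain:

#include <stdio.h>

struct req {
	const char *name;
	int prio;
	struct req *prereq;	/* single prerequisite, for brevity */
	int done;
};

/* Bump the whole dependency chain so the prerequisites run first. */
static void bump(struct req *r, int prio)
{
	for (; r && r->prio < prio; r = r->prereq)
		r->prio = prio;
}

/* Choose the highest priority request whose prerequisite is done. */
static struct req *pick_next(struct req *q, int n)
{
	struct req *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (q[i].done || (q[i].prereq && !q[i].prereq->done))
			continue;
		if (!best || q[i].prio > best->prio)
			best = &q[i];
	}
	return best;
}

int main(void)
{
	struct req q[3] = {
		{ "background", 0, NULL, 0 },
		{ "render", 0, NULL, 0 },
		{ "flip", 0, &q[1], 0 },	/* flip waits on render */
	};
	struct req *r;

	bump(&q[2], 1024);	/* high priority flip drags render along */
	while ((r = pick_next(q, 3))) {
		printf("run %s (prio %d)\n", r->name, r->prio);
		r->done = 1;
	}
	return 0;
}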

Changes since the preview: a weekend of testing with lockdep, and by
popular demand the GuC is now plugged into a software scheduler.
-Chris


* [PATCH 01/12] drm/i915: Split request submit/execute phase into two
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 10:35   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
                   ` (11 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

In order to support deferred scheduling, we need to differentiate
between when the request is ready to run (i.e. the submit fence is
signaled) and when the request is actually run (a new execute fence).
This is typically split between the request itself wanting to wait upon
others (for which we use the submit fence) and the CPU wanting to wait
upon the request (for which we use the execute fence, so that we know
the hardware is ready to signal completion).
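
A toy model of the counting scheme (invented names, not the driver
code; the real i915_sw_fence keeps a pending count): the execute fence
holds one count that only the submit fence's completion drops,
mirroring the i915_sw_fence_await_sw_fence() call in the patch below,
so execute can never be observed to signal first:

#include <assert.h>
#include <stdio.h>

struct fence {
	int pending;		/* prerequisites + one self reference */
	const char *name;
};

static struct fence submit = { 2, "submit" };	/* one dep + self */
static struct fence execute = { 2, "execute" };	/* submit + self */

static void fence_complete(struct fence *f)
{
	if (--f->pending)
		return;
	printf("%s fence signalled\n", f->name);
	if (f == &submit)		/* submit chains into execute */
		fence_complete(&execute);
}

int main(void)
{
	fence_complete(&execute);	/* commit: drop self reference */
	fence_complete(&submit);	/* last prerequisite completes */
	fence_complete(&submit);	/* commit: drop self reference */
	assert(!submit.pending && !execute.pending);
	return 0;
}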

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 33 ++++++++++++++++++++++++---------
 drivers/gpu/drm/i915/i915_gem_request.h |  2 ++
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 79b0046d9a57..1ae5a2f8953f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -351,11 +351,19 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
+	i915_sw_fence_commit(&request->execute);
+
 	spin_unlock_irqrestore(&timeline->lock, flags);
 
 	return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	return NOTIFY_DONE;
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -441,6 +449,12 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       __timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
+	i915_sw_fence_init(&req->execute, execute_notify);
+	/* Ensure that the execute fence completes after the submit fence -
+	 * as we complete the execute fence from within the submit fence
+	 * callback, its completion would otherwise be visible first.
+	 */
+	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
@@ -817,9 +831,9 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 }
 
 static long
-__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
-			       unsigned int flags,
-			       long timeout)
+__i915_request_wait_for_execute(struct drm_i915_gem_request *request,
+				unsigned int flags,
+				long timeout)
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
@@ -831,9 +845,9 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 		add_wait_queue(q, &reset);
 
 	do {
-		prepare_to_wait(&request->submit.wait, &wait, state);
+		prepare_to_wait(&request->execute.wait, &wait, state);
 
-		if (i915_sw_fence_done(&request->submit))
+		if (i915_sw_fence_done(&request->execute))
 			break;
 
 		if (flags & I915_WAIT_LOCKED &&
@@ -851,7 +865,7 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 
 		timeout = io_schedule_timeout(timeout);
 	} while (timeout);
-	finish_wait(&request->submit.wait, &wait);
+	finish_wait(&request->execute.wait, &wait);
 
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(q, &reset);
@@ -903,13 +917,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 	trace_i915_gem_request_wait_begin(req);
 
-	if (!i915_sw_fence_done(&req->submit)) {
-		timeout = __i915_request_wait_for_submit(req, flags, timeout);
+	if (!i915_sw_fence_done(&req->execute)) {
+		timeout = __i915_request_wait_for_execute(req, flags, timeout);
 		if (timeout < 0)
 			goto complete;
 
-		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
+		GEM_BUG_ON(!i915_sw_fence_done(&req->execute));
 	}
+	GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
 	GEM_BUG_ON(!req->global_seqno);
 
 	/* Optimistic short spin before touching IRQs */
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 75f8360b3421..ed13f37fea0f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -85,7 +85,9 @@ struct drm_i915_gem_request {
 	struct intel_signal_node signaling;
 
 	struct i915_sw_fence submit;
+	struct i915_sw_fence execute;
 	wait_queue_t submitq;
+	wait_queue_t execq;
 
 	u32 global_seqno;
 
-- 
2.10.2


* [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
  2016-11-02 17:50 ` [PATCH 01/12] drm/i915: Split request submit/execute phase into two Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 10:34   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 03/12] drm/i915: Remove engine->execlist_lock Chris Wilson
                   ` (10 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is only put onto the hardware
queue when it is ready, and is finally submitted to the hardware when
the hardware is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than fifo.
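
A toy model of the deferral (invented names, not the driver code): the
global seqno is handed out only at the point of actual hardware
submission, as __i915_gem_request_submit() does below, so the execution
timeline records execution order rather than the order in which
requests became ready:

#include <stdio.h>

static unsigned int next_seqno;

struct request {
	const char *name;
	unsigned int global_seqno;	/* 0 until actually submitted */
};

/* The global seqno is assigned at hardware submission, not at the
 * point of readiness, so it always reflects execution order. */
static void hw_submit(struct request *rq)
{
	rq->global_seqno = ++next_seqno;
	printf("%s -> global_seqno %u\n", rq->name, rq->global_seqno);
}

int main(void)
{
	struct request lo = { "low-prio" };
	struct request hi = { "high-prio" };

	/* lo became ready first (fifo), but the scheduler picked hi;
	 * the execution timeline stays ordered all the same */
	hw_submit(&hi);
	hw_submit(&lo);
	return 0;
}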

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c    | 36 ++++++++++++++++++------------
 drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
 drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
 drivers/gpu/drm/i915/intel_lrc.c           |  5 +++++
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
 5 files changed, 33 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1ae5a2f8953f..05544dec5de9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 	return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request =
-		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
-	unsigned long flags;
 	u32 seqno;
 
-	if (state != FENCE_COMPLETE)
-		return NOTIFY_DONE;
-
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
-
-	/* Will be called from irq-context when using foreign DMA fences */
-	spin_lock_irqsave(&timeline->lock, flags);
+	assert_spin_locked(&timeline->lock);
 
 	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
@@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
-	engine->submit_request(request);
 
-	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+	/* nested from submit_notify */
+	spin_lock_nested(&request->timeline->lock, 2);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
 	i915_sw_fence_commit(&request->execute);
+}
 
-	spin_unlock_irqrestore(&timeline->lock, flags);
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	if (state == FENCE_COMPLETE) {
+		struct drm_i915_gem_request *request =
+			container_of(fence, typeof(*request), submit);
+		struct intel_engine_cs *engine = request->engine;
+		unsigned long flags;
+
+		/* Will be called from irq-context when using foreign fences,
+		 * and may be called from the execution callback of another
+		 * engine.
+		 */
+		spin_lock_irqsave_nested(&engine->timeline->lock, flags, 1);
+		engine->submit_request(request);
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
+	}
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index ed13f37fea0f..725d04128eaf 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -228,6 +228,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 857ef914cae7..ddb574d5ceda 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -640,6 +640,8 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;
 
+	__i915_gem_request_submit(rq);
+
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fa3012c342cc..b9aba16851ac 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *cursor, *last;
 	struct execlist_port *port = engine->execlist_port;
+	unsigned long flags;
 	bool submit = false;
 
 	last = port->request;
@@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+	spin_lock_irqsave(&engine->timeline->lock, flags);
 	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
@@ -501,6 +503,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			i915_gem_request_assign(&port->request, last);
 			port++;
 		}
+
+		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
@@ -512,6 +516,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		i915_gem_request_assign(&port->request, last);
 	}
 	spin_unlock(&engine->execlist_lock);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 700e93d80616..f91ee24e2763 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_private *dev_priv = request->i915;
 
+	__i915_gem_request_submit(request);
+
 	I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-- 
2.10.2


* [PATCH 03/12] drm/i915: Remove engine->execlist_lock
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
  2016-11-02 17:50 ` [PATCH 01/12] drm/i915: Split request submit/execute phase into two Chris Wilson
  2016-11-02 17:50 ` [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 10:47   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
                   ` (9 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

The execlist_lock is now completely subsumed by the engine->timeline->lock,
and so we can remove the redundant layer of locking.
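
A toy illustration with pthread stand-ins (invented names): once every
access to the queue already takes place under the outer lock, an inner
lock that is only ever taken inside it protects nothing extra and can
be dropped:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t outer = PTHREAD_MUTEX_INITIALIZER;
static int queue_len;	/* stands in for engine->execlist_queue */

static void submit(void)
{
	pthread_mutex_lock(&outer);	/* the timeline lock */
	/* an inner lock here would only ever be taken with the
	 * outer lock already held, so it adds no protection */
	queue_len++;
	pthread_mutex_unlock(&outer);
}

static void dequeue(void)
{
	pthread_mutex_lock(&outer);
	if (queue_len)
		queue_len--;
	pthread_mutex_unlock(&outer);
}

int main(void)
{
	submit();
	dequeue();
	printf("queue_len = %d\n", queue_len);
	return 0;
}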

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 4 ++--
 drivers/gpu/drm/i915/i915_gem.c         | 4 ++--
 drivers/gpu/drm/i915/intel_engine_cs.c  | 1 -
 drivers/gpu/drm/i915/intel_lrc.c        | 7 +------
 drivers/gpu/drm/i915/intel_ringbuffer.h | 1 -
 5 files changed, 5 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index c9465fbff2df..3cb96d260dfb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3256,11 +3256,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 				seq_printf(m, "\t\tELSP[1] idle\n");
 			rcu_read_unlock();
 
-			spin_lock_irq(&engine->execlist_lock);
+			spin_lock_irq(&engine->timeline->lock);
 			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
 				print_request(m, rq, "\t\tQ ");
 			}
-			spin_unlock_irq(&engine->execlist_lock);
+			spin_unlock_irq(&engine->timeline->lock);
 		} else if (INTEL_GEN(dev_priv) > 6) {
 			seq_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
 				   I915_READ(RING_PP_DIR_BASE(engine)));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 5839bebba64a..cf28666021f8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2686,12 +2686,12 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 	 */
 
 	if (i915.enable_execlists) {
-		spin_lock(&engine->execlist_lock);
+		spin_lock_irq(&engine->timeline->lock);
 		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
-		spin_unlock(&engine->execlist_lock);
+		spin_unlock_irq(&engine->timeline->lock);
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 841f8d1e1410..298f0f95dd3f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -237,7 +237,6 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
 	INIT_LIST_HEAD(&engine->execlist_queue);
-	spin_lock_init(&engine->execlist_lock);
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index b9aba16851ac..186c3ccb2c34 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -471,7 +471,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
@@ -515,7 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 		i915_gem_request_assign(&port->request, last);
 	}
-	spin_unlock(&engine->execlist_lock);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
@@ -600,9 +598,8 @@ static void intel_lrc_irq_handler(unsigned long data)
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	unsigned long flags;
 
-	spin_lock_irqsave(&engine->execlist_lock, flags);
+	assert_spin_locked(&engine->timeline->lock);
 
 	/* We keep the previous context alive until we retire the following
 	 * request. This ensures that any the context object is still pinned
@@ -616,8 +613,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
-
-	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 }
 
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 642b54692d0d..062bc8e1872a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -335,7 +335,6 @@ struct intel_engine_cs {
 
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
-	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
 	struct execlist_port {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
-- 
2.10.2


* [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (2 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 03/12] drm/i915: Remove engine->execlist_lock Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 10:49   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
                   ` (8 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

The start of the scheduler: add a hook into request submission so that
the scheduler can see the arrival of new requests and prepare its
runqueues.
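
A toy sketch of the hook (invented names, not the driver code):
backends that implement a scheduler install engine->schedule; everyone
else leaves it NULL and requests simply execute in submission order:

#include <stdio.h>

struct request { int prio; };

struct engine {
	/* NULL means nothing to do: execute in order of dependencies */
	void (*schedule)(struct request *rq, int prio);
};

/* Called on request construction, before the request is ready to
 * run, so the backend can adjust its runqueues ahead of time. */
static void add_request(struct engine *engine, struct request *rq)
{
	if (engine->schedule)
		engine->schedule(rq, 0);
}

static void toy_schedule(struct request *rq, int prio)
{
	if (prio > rq->prio)
		rq->prio = prio;
	printf("scheduler saw request, prio %d\n", rq->prio);
}

int main(void)
{
	struct engine legacy = { NULL };	/* ringbuffer: no hook */
	struct engine execlists = { toy_schedule };
	struct request rq = { 0 };

	add_request(&legacy, &rq);	/* no-op */
	add_request(&execlists, &rq);	/* hook fires */
	return 0;
}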

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.c         |  4 ++++
 drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++++++
 drivers/gpu/drm/i915/intel_engine_cs.c  |  3 +++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  9 +++++++++
 include/uapi/drm/i915_drm.h             |  5 +++++
 5 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 79cea49183b3..5a0e885e6104 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -323,6 +323,10 @@ static int i915_getparam(struct drm_device *dev, void *data,
 		 */
 		value = i915_gem_mmap_gtt_version();
 		break;
+	case I915_PARAM_HAS_SCHEDULER:
+		value = dev_priv->engine[RCS] &&
+			dev_priv->engine[RCS]->schedule;
+		break;
 	case I915_PARAM_MMAP_VERSION:
 		/* Remember to bump this if the version changes! */
 	case I915_PARAM_HAS_GEM:
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 05544dec5de9..9c8605c834f9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -759,6 +759,19 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	i915_gem_mark_busy(engine);
 
+	/* Let the backend know a new request has arrived that may need
+	 * to adjust the existing execution schedule due to a high priority
+	 * request - i.e. we may want to preempt the current request in order
+	 * to run a high priority dependency chain *before* we can execute this
+	 * request.
+	 *
+	 * This is called before the request is ready to run so that we can
+	 * decide whether to preempt the entire chain so that it is ready to
+	 * run at the earliest possible convenience.
+	 */
+	if (engine->schedule)
+		engine->schedule(request, 0);
+
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 298f0f95dd3f..c9171a058478 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -102,6 +102,9 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	engine->mmio_base = info->mmio_base;
 	engine->irq_shift = info->irq_shift;
 
+	/* Nothing to do here, execute in order of dependencies */
+	engine->schedule = NULL;
+
 	dev_priv->engine[id] = engine;
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 062bc8e1872a..75991a3c694b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -267,6 +267,15 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct drm_i915_gem_request *req);
 
+	/* Call when the priority on a request has changed and it and its
+	 * dependencies may need rescheduling. Note the request itself may
+	 * not be ready to run!
+	 *
+	 * Called under the struct_mutex.
+	 */
+	void		(*schedule)(struct drm_i915_gem_request *request,
+				    int priority);
+
 	/* Some chipsets are not quite as coherent as advertised and need
 	 * an expensive kick to force a true read of the up-to-date seqno.
 	 * However, the up-to-date seqno is not always required and the last
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 03725fe89859..1c12a350eca3 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -389,6 +389,11 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_MIN_EU_IN_POOL	 39
 #define I915_PARAM_MMAP_GTT_VERSION	 40
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
+ * priorities and the driver will attempt to execute batches in priority order.
+ */
+#define I915_PARAM_HAS_SCHEDULER	 41
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
-- 
2.10.2
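
For reference, userspace would probe the new I915_PARAM_HAS_SCHEDULER
above via the standard getparam ioctl along these lines (illustrative
sketch only: the device node path is an assumption, and if your copy
of i915_drm.h predates this patch you need to define
I915_PARAM_HAS_SCHEDULER as 41 yourself):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

int main(void)
{
	struct drm_i915_getparam gp = { .param = I915_PARAM_HAS_SCHEDULER };
	int fd, value = 0;

	fd = open("/dev/dri/renderD128", O_RDWR);	/* adjust node */
	if (fd < 0)
		return 1;

	gp.value = &value;
	if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0)
		printf("scheduler: %s\n", value ? "yes" : "no");
	else
		printf("scheduler: unknown (old kernel)\n");

	close(fd);
	return 0;
}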


* [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (3 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 11:03   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
                   ` (7 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

The scheduler needs to know the dependencies of each request for the
lifetime of the request, as it may choose to reschedule the requests at
any time and must ensure the dependency tree is not broken. This is in
addition to using the fence to only allow execution after all
dependencies have been completed.

One option was to extend the fence to support the bidirectional
dependency tracking required by the scheduler. However the mismatch in
lifetimes between the submit fence and the request essentially meant
that we had to build a completely separate struct (and we could not
simply reuse the existing waitqueue in the fence for one half of the
dependency tracking). The extra dependency tracking simply did not mesh
well with the fence, and keeping it separate both keeps the fence
implementation simpler and allows us to extend the dependency tracking
into a priority tree (whilst maintaining support for reordering the
tree).

To avoid the additional allocations and list manipulations, the use of
the priotree is disabled when there are no schedulers to use it.
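
A toy model of the bidirectional bookkeeping (invented names, not the
driver code): a single dependency record sits on two lists at once,
the consumer's pre list and the producer's post list, just as the
i915_dependency below links into both pre_list and post_list, so
either side can find the other for the lifetime of the requests:

#include <stdio.h>

#define MAX_DEPS 8

struct dep;

struct node {
	const char *name;
	struct dep *pre[MAX_DEPS];	/* whom we depend upon */
	struct dep *post[MAX_DEPS];	/* who depends upon us */
	int npre, npost;
};

struct dep {
	struct node *signal;	/* the producer */
	struct node *waiter;	/* the consumer */
};

static void add_dependency(struct node *waiter, struct node *signal,
			   struct dep *dep)
{
	dep->signal = signal;
	dep->waiter = waiter;
	waiter->pre[waiter->npre++] = dep;	/* one record ... */
	signal->post[signal->npost++] = dep;	/* ... on both lists */
}

int main(void)
{
	struct node render = { "render" }, flip = { "flip" };
	struct dep d;

	add_dependency(&flip, &render, &d);
	printf("%s waits on %s; %s unblocks %s\n",
	       flip.name, flip.pre[0]->signal->name,
	       render.name, render.post[0]->waiter->name);
	return 0;
}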

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 72 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_request.h | 23 +++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 9c8605c834f9..13090f226203 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -113,6 +113,59 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static int
+i915_priotree_add_dependency(struct i915_priotree *pt,
+			     struct i915_priotree *signal,
+			     struct i915_dependency *dep)
+{
+	unsigned long flags = 0;
+
+	if (!dep) {
+		dep = kmalloc(sizeof(*dep), GFP_KERNEL);
+		if (!dep)
+			return -ENOMEM;
+
+		flags |= I915_DEPENDENCY_ALLOC;
+	}
+
+	dep->signal = signal;
+	dep->flags = flags;
+
+	list_add(&dep->post_link, &signal->post_list);
+	list_add(&dep->pre_link, &pt->pre_list);
+	return 0;
+}
+
+static void
+i915_priotree_fini(struct i915_priotree *pt)
+{
+	struct i915_dependency *dep, *next;
+
+	/* Everyone we depended upon should retire before us and remove
+	 * themselves from our list. However, retirement is run independently
+	 * on each timeline and so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, next, &pt->pre_list, pre_link) {
+		list_del(&dep->post_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			kfree(dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, next, &pt->post_list, post_link) {
+		list_del(&dep->pre_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			kfree(dep);
+	}
+}
+
+static void
+i915_priotree_init(struct i915_priotree *pt)
+{
+	INIT_LIST_HEAD(&pt->pre_list);
+	INIT_LIST_HEAD(&pt->post_list);
+}
+
 void i915_gem_retire_noop(struct i915_gem_active *active,
 			  struct drm_i915_gem_request *request)
 {
@@ -182,6 +235,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);
+
+	i915_priotree_fini(&request->priotree);
 	i915_gem_request_put(request);
 }
 
@@ -464,6 +519,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
+	i915_priotree_init(&req->priotree);
+
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
@@ -517,6 +574,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 
 	GEM_BUG_ON(to == from);
 
+	if (to->engine->schedule) {
+		ret = i915_priotree_add_dependency(&to->priotree,
+						   &from->priotree,
+						   NULL);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (to->timeline == from->timeline)
 		return 0;
 
@@ -740,9 +805,14 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
-	if (prev)
+	if (prev) {
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
+		if (engine->schedule)
+			i915_priotree_add_dependency(&request->priotree,
+						     &prev->priotree,
+						     &request->depq);
+	}
 
 	spin_lock_irq(&timeline->lock);
 	list_add_tail(&request->link, &timeline->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 725d04128eaf..0cb2ba546062 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -41,6 +41,18 @@ struct intel_signal_node {
 	struct intel_wait wait;
 };
 
+struct i915_dependency {
+	struct i915_priotree *signal;
+	struct list_head pre_link, post_link;
+	unsigned long flags;
+#define I915_DEPENDENCY_ALLOC BIT(0)
+};
+
+struct i915_priotree {
+	struct list_head pre_list; /* who is before us, we depend upon */
+	struct list_head post_list; /* who is after us, they depend upon us */
+};
+
 /**
  * Request queue structure.
  *
@@ -89,6 +101,17 @@ struct drm_i915_gem_request {
 	wait_queue_t submitq;
 	wait_queue_t execq;
 
+	/* A list of everyone we wait upon, and everyone who waits upon us.
+	 * Even though we will not be submitted to the hardware before the
+	 * submit fence is signaled (it waits for all external events as well
+	 * as our own requests), the scheduler still needs to know the
+	 * dependency tree for the lifetime of the request (from execbuf
+	 * to retirement), i.e. bidirectional dependency information for the
+	 * request not tied to individual fences.
+	 */
+	struct i915_priotree priotree;
+	struct i915_dependency depq;
+
 	u32 global_seqno;
 
 	/** GEM sequence number associated with the previous request,
-- 
2.10.2


* [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (4 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 16:21   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips Chris Wilson
                   ` (6 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match, so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not preempt the currently executing request to immediately run a
very high priority request, at least not yet.

One further limitation is that this first implementation is for
execlists only, so it is currently limited to gen8/gen9.
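
A toy model of the queue ordering (invented names, sorted list instead
of an rbtree): most positive priority runs first, and equal priorities
keep fifo order because new entries are inserted after existing
entries of the same priority, just as insert_request() below walks
right on ties:

#include <stdio.h>
#include <stdlib.h>

struct entry {
	int prio, seq;
	struct entry *next;
};

/* Insert in priority order; ties go after existing entries, which
 * is what keeps equal priorities in fifo order. */
static void insert(struct entry **head, struct entry *e)
{
	while (*head && (*head)->prio >= e->prio)
		head = &(*head)->next;
	e->next = *head;
	*head = e;
}

int main(void)
{
	int prios[] = { 0, 1024, 0, 1024 };
	struct entry *q = NULL, *e;
	int i;

	for (i = 0; i < 4; i++) {
		e = calloc(1, sizeof(*e));
		e->prio = prios[i];
		e->seq = i;	/* submission order */
		insert(&q, e);
	}
	for (e = q; e; e = e->next)
		printf("prio %d, seq %d\n", e->prio, e->seq);
	/* prints 1024/1, 1024/3, 0/0, 0/2 */
	return 0;
}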

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
 drivers/gpu/drm/i915/i915_gem.c            |   3 +-
 drivers/gpu/drm/i915/i915_gem_request.c    |   4 ++
 drivers/gpu/drm/i915/i915_gem_request.h    |   6 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |   4 ++
 drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 104 ++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
 8 files changed, 109 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
 			  struct drm_i915_gem_request *rq,
 			  const char *prefix)
 {
-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+		   rq->priotree.priority,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
 		   rq->timeline->common->name);
 }
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 
 		if (i915.enable_execlists) {
 			u32 ptr, read, write;
+			struct rb_node *rb;
 
 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 			rcu_read_unlock();
 
 			spin_lock_irq(&engine->timeline->lock);
-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+				rq = rb_entry(rb, typeof(*rq), priotree.node);
 				print_request(m, rq, "\t\tQ ");
 			}
 			spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cf28666021f8..4697848ecfd9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 
 	if (i915.enable_execlists) {
 		spin_lock_irq(&engine->timeline->lock);
-		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		engine->execlist_queue = RB_ROOT;
+		engine->execlist_first = NULL;
 		spin_unlock_irq(&engine->timeline->lock);
 	}
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13090f226203..5f0068fac3e9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
 {
 	struct i915_dependency *dep, *next;
 
+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
 	/* Everyone we depended upon should retire before us and remove
 	 * themselves from our list. However, retirement is run independently
 	 * on each timeline and so we may be called out-of-order.
@@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt)
 {
 	INIT_LIST_HEAD(&pt->pre_list);
 	INIT_LIST_HEAD(&pt->post_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;
 }
 
 void i915_gem_retire_noop(struct i915_gem_active *active,
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 0cb2ba546062..4288f512ac51 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -51,6 +51,9 @@ struct i915_dependency {
 struct i915_priotree {
 	struct list_head pre_list; /* who is before us, we depend upon */
 	struct list_head post_list; /* who is after us, they depend upon us */
+	struct rb_node node;
+	int priority;
+#define I915_PRIORITY_MAX 1024
 };
 
 /**
@@ -168,9 +171,6 @@ struct drm_i915_gem_request {
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_list;
-
-	/** Link in the execlist submission queue, guarded by execlist_lock. */
-	struct list_head execlist_link;
 };
 
 extern const struct dma_fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index ddb574d5ceda..b31573a825fa 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -640,6 +640,9 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;
 
+	rq->previous_context = rq->engine->last_context;
+	rq->engine->last_context = rq->ctx;
+
 	__i915_gem_request_submit(rq);
 
 	spin_lock(&client->wq_lock);
@@ -1522,6 +1525,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = i915_guc_submit;
+		engine->schedule = NULL;
 
 		/* Replay the current set of previously submitted requests */
 		list_for_each_entry(request,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c9171a058478..3da4d466e332 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
  */
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
-	INIT_LIST_HEAD(&engine->execlist_queue);
+	engine->execlist_queue = RB_ROOT;
+	engine->execlist_first = NULL;
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 186c3ccb2c34..70ac74c959bd 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *cursor, *last;
+	struct drm_i915_gem_request *last;
 	struct execlist_port *port = engine->execlist_port;
 	unsigned long flags;
+	struct rb_node *rb;
 	bool submit = false;
 
 	last = port->request;
@@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
 		 * (e.g. GVT saying never to combine contexts).
@@ -503,16 +508,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			port++;
 		}
 
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		/* We keep the previous context alive until we retire the
+		 * following request. This ensures that the context object
+		 * is still pinned for any residual writes the HW makes into
+		 * it on the context switch into the next object following the
+		 * breadcrumb. Otherwise, we may retire the context too early.
+		 */
+		cursor->previous_context = engine->last_context;
+		engine->last_context = cursor->ctx;
+
 		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
 	if (submit) {
-		/* Decouple all the requests submitted from the queue */
-		engine->execlist_queue.next = &cursor->execlist_link;
-		cursor->execlist_link.prev = &engine->execlist_queue;
-
 		i915_gem_request_assign(&port->request, last);
+		engine->execlist_first = rb;
 	}
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
@@ -595,26 +611,77 @@ static void intel_lrc_irq_handler(unsigned long data)
 	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
+static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
+{
+	struct rb_node **p, *rb;
+	bool first = true;
+
+	/* most positive priority is scheduled first, equal priorities fifo */
+	rb = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct i915_priotree *pos;
+
+		rb = *p;
+		pos = rb_entry(rb, typeof(*pos), node);
+		if (pt->priority > pos->priority) {
+			p = &rb->rb_left;
+		} else {
+			p = &rb->rb_right;
+			first = false;
+		}
+	}
+	rb_link_node(&pt->node, rb, p);
+	rb_insert_color(&pt->node, root);
+
+	return first;
+}
+
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 
 	assert_spin_locked(&engine->timeline->lock);
 
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-
-	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (insert_request(&request->priotree, &engine->execlist_queue))
+		engine->execlist_first = &request->priotree.node;
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 }
 
+static void update_priorities(struct i915_priotree *pt, int prio)
+{
+	struct drm_i915_gem_request *request =
+		container_of(pt, struct drm_i915_gem_request, priotree);
+	struct intel_engine_cs *engine = request->engine;
+	struct i915_dependency *dep;
+
+	if (prio <= READ_ONCE(pt->priority))
+		return;
+
+	/* Recursively bump all dependent priorities to match the new request */
+	list_for_each_entry(dep, &pt->pre_list, pre_link)
+		update_priorities(dep->signal, prio);
+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	spin_lock_irq(&engine->timeline->lock);
+	pt->priority = prio;
+	if (!RB_EMPTY_NODE(&pt->node)) {
+		rb_erase(&pt->node, &engine->execlist_queue);
+		if (insert_request(pt, &engine->execlist_queue))
+			engine->execlist_first = &pt->node;
+	}
+	spin_unlock_irq(&engine->timeline->lock);
+}
+
+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
+{
+	/* Make sure that our entire dependency chain shares our prio */
+	update_priorities(&request->priotree, prio);
+
+	/* XXX Do we need to preempt to make room for us and our deps? */
+}
+
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -1651,8 +1718,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = execlists_submit_request;
+		engine->schedule = execlists_schedule;
+	}
 }
 
 static void
@@ -1665,6 +1734,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 	engine->submit_request = execlists_submit_request;
+	engine->schedule = execlists_schedule;
 
 	engine->irq_enable = gen8_logical_ring_enable_irq;
 	engine->irq_disable = gen8_logical_ring_disable_irq;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 75991a3c694b..cbc148863a03 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -348,7 +348,8 @@ struct intel_engine_cs {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
 	} execlist_port[2];
-	struct list_head execlist_queue;
+	struct rb_root execlist_queue;
+	struct rb_node *execlist_first;
 	unsigned int fw_domains;
 	bool disable_lite_restore_wa;
 	bool preempt_wa;
-- 
2.10.2


* [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (5 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 16:29   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 08/12] drm/i915/guc: Cache the client mapping Chris Wilson
                   ` (5 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Boost the priority of any rendering required to show the next pageflip
as we want to avoid missing the vblank by being delayed by an invisible
workload. We prioritise avoiding jank and jitter in the GUI over the
risk of starving background tasks.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h      |  5 ++++
 drivers/gpu/drm/i915/i915_gem.c      | 50 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_display.c |  2 ++
 3 files changed, 57 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 61fee0b0c302..738ec44fe6af 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3416,6 +3416,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
 			 unsigned int flags,
 			 long timeout,
 			 struct intel_rps_client *rps);
+int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+				  unsigned int flags,
+				  int priority);
+#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
+
 int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 				  bool write);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 4697848ecfd9..4287c51fb461 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -433,6 +433,56 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
 	return timeout;
 }
 
+static void fence_set_priority(struct dma_fence *fence, int prio)
+{
+	struct drm_i915_gem_request *rq;
+	struct intel_engine_cs *engine;
+
+	if (!dma_fence_is_i915(fence))
+		return;
+
+	rq = to_request(fence);
+	engine = rq->engine;
+	if (!engine->schedule)
+		return;
+
+	engine->schedule(rq, prio);
+}
+
+int
+i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+			      unsigned int flags,
+			      int prio)
+{
+	struct dma_fence *excl;
+
+	if (flags & I915_WAIT_ALL) {
+		struct dma_fence **shared;
+		unsigned int count, i;
+		int ret;
+
+		ret = reservation_object_get_fences_rcu(obj->resv,
+							&excl, &count, &shared);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < count; i++) {
+			fence_set_priority(shared[i], prio);
+			dma_fence_put(shared[i]);
+		}
+
+		kfree(shared);
+	} else {
+		excl = reservation_object_get_excl_rcu(obj->resv);
+	}
+
+	if (excl) {
+		fence_set_priority(excl, prio);
+		dma_fence_put(excl);
+	}
+	return 0;
+}
+
 /**
  * Waits for rendering to the object to be completed
  * @obj: i915 gem object
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 97589102442c..8df9554432a9 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -14761,6 +14761,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 						      GFP_KERNEL);
 		if (ret < 0)
 			return ret;
+
+		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
 	}
 
 	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
-- 
2.10.2


* [PATCH 08/12] drm/i915/guc: Cache the client mapping
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (6 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 16:37   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
                   ` (4 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Use i915_gem_object_pin_map() for the guc client's lifetime to replace
the persistent kmap + frequent kmap_atomic with a permanent vmapping.
This avoids taking the obj->mm.lock mutex whilst inside irq context
later.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 38 +++++++++++++++---------------
 drivers/gpu/drm/i915/intel_guc.h           |  2 +-
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index b31573a825fa..bab0c2fc3bce 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -220,7 +220,7 @@ static int guc_update_doorbell_id(struct intel_guc *guc,
 	struct guc_context_desc desc;
 	size_t len;
 
-	doorbell = client->client_base + client->doorbell_offset;
+	doorbell = client->vaddr + client->doorbell_offset;
 
 	if (client->doorbell_id != GUC_INVALID_DOORBELL_ID &&
 	    test_bit(client->doorbell_id, doorbell_bitmap)) {
@@ -326,7 +326,7 @@ static void guc_proc_desc_init(struct intel_guc *guc,
 {
 	struct guc_process_desc *desc;
 
-	desc = client->client_base + client->proc_desc_offset;
+	desc = client->vaddr + client->proc_desc_offset;
 
 	memset(desc, 0, sizeof(*desc));
 
@@ -413,8 +413,8 @@ static void guc_ctx_desc_init(struct intel_guc *guc,
 	gfx_addr = i915_ggtt_offset(client->vma);
 	desc.db_trigger_phy = sg_dma_address(client->vma->pages->sgl) +
 				client->doorbell_offset;
-	desc.db_trigger_cpu = (uintptr_t)client->client_base +
-				client->doorbell_offset;
+	desc.db_trigger_cpu =
+		(uintptr_t)client->vaddr + client->doorbell_offset;
 	desc.db_trigger_uk = gfx_addr + client->doorbell_offset;
 	desc.process_desc = gfx_addr + client->proc_desc_offset;
 	desc.wq_addr = gfx_addr + client->wq_offset;
@@ -465,7 +465,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 {
 	const size_t wqi_size = sizeof(struct guc_wq_item);
 	struct i915_guc_client *gc = request->i915->guc.execbuf_client;
-	struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
+	struct guc_process_desc *desc = gc->vaddr + gc->proc_desc_offset;
 	u32 freespace;
 	int ret;
 
@@ -506,10 +506,9 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
 	struct intel_engine_cs *engine = rq->engine;
 	struct guc_process_desc *desc;
 	struct guc_wq_item *wqi;
-	void *base;
-	u32 freespace, tail, wq_off, wq_page;
+	u32 freespace, tail, wq_off;
 
-	desc = gc->client_base + gc->proc_desc_offset;
+	desc = gc->vaddr + gc->proc_desc_offset;
 
 	/* Free space is guaranteed, see i915_guc_wq_reserve() above */
 	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
@@ -539,10 +538,7 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
 	gc->wq_rsvd -= wqi_size;
 
 	/* WQ starts from the page after doorbell / process_desc */
-	wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
-	wq_off &= PAGE_SIZE - 1;
-	base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
-	wqi = (struct guc_wq_item *)((char *)base + wq_off);
+	wqi = gc->vaddr + wq_off + GUC_DB_SIZE;
 
 	/* Now fill in the 4-word work queue item */
 	wqi->header = WQ_TYPE_INORDER |
@@ -555,8 +551,6 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
 
 	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
 	wqi->fence_id = rq->global_seqno;
-
-	kunmap_atomic(base);
 }
 
 static int guc_ring_doorbell(struct i915_guc_client *gc)
@@ -566,7 +560,7 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
 	union guc_doorbell_qw *db;
 	int attempt = 2, ret = -EAGAIN;
 
-	desc = gc->client_base + gc->proc_desc_offset;
+	desc = gc->vaddr + gc->proc_desc_offset;
 
 	/* Update the tail so it is visible to GuC */
 	desc->tail = gc->wq_tail;
@@ -582,7 +576,7 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
 		db_exc.cookie = 1;
 
 	/* pointer of current doorbell cacheline */
-	db = gc->client_base + gc->doorbell_offset;
+	db = gc->vaddr + gc->doorbell_offset;
 
 	while (attempt--) {
 		/* lets ring the doorbell */
@@ -729,14 +723,14 @@ guc_client_free(struct drm_i915_private *dev_priv,
 	 * Be sure to drop any locks
 	 */
 
-	if (client->client_base) {
+	if (client->vaddr) {
 		/*
 		 * If we got as far as setting up a doorbell, make sure we
 		 * shut it down before unmapping & deallocating the memory.
 		 */
 		guc_disable_doorbell(guc, client);
 
-		kunmap(kmap_to_page(client->client_base));
+		i915_gem_object_unpin_map(client->vma->obj);
 	}
 
 	i915_vma_unpin_and_release(&client->vma);
@@ -825,6 +819,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 	struct i915_guc_client *client;
 	struct intel_guc *guc = &dev_priv->guc;
 	struct i915_vma *vma;
+	void *vaddr;
 	uint16_t db_id;
 
 	client = kzalloc(sizeof(*client), GFP_KERNEL);
@@ -851,7 +846,12 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
 
 	/* We'll keep just the first (doorbell/proc) page permanently kmap'd. */
 	client->vma = vma;
-	client->client_base = kmap(i915_vma_first_page(vma));
+
+	vaddr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
+	if (IS_ERR(vaddr))
+		goto err;
+
+	client->vaddr = vaddr;
 
 	spin_lock_init(&client->wq_lock);
 	client->wq_offset = GUC_DB_SIZE;
diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
index 49ced0bad0f5..0053258e03d3 100644
--- a/drivers/gpu/drm/i915/intel_guc.h
+++ b/drivers/gpu/drm/i915/intel_guc.h
@@ -64,7 +64,7 @@ struct drm_i915_gem_request;
  */
 struct i915_guc_client {
 	struct i915_vma *vma;
-	void *client_base;		/* first page (only) of above	*/
+	void *vaddr;
 	struct i915_gem_context *owner;
 	struct intel_guc *guc;
 
-- 
2.10.2


* [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (7 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 08/12] drm/i915/guc: Cache the client mapping Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-03 13:17   ` Tvrtko Ursulin
  2016-11-02 17:50 ` [PATCH 10/12] drm/i915/scheduler: Support user-defined priorities Chris Wilson
                   ` (3 subsequent siblings)
  12 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).
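
A toy model of the two-port bookkeeping in i915_guc_irq_handler() and
i915_guc_dequeue() below (invented names, not the driver code): retire
completed requests from port[0], shuffle port[1] down, and refill from
the queue whenever the second port is free:

#include <stdbool.h>
#include <stdio.h>

struct request { int id; bool completed; };

static struct request *port[2];
static struct request queue[4] = { {1}, {2}, {3}, {4} };
static int qhead;

static bool dequeue(void)
{
	bool submit = false;

	while (qhead < 4 && (!port[0] || !port[1])) {
		struct request *rq = &queue[qhead++];

		port[port[0] ? 1 : 0] = rq;	/* feed an idle port */
		printf("submit %d\n", rq->id);
		submit = true;
	}
	return submit;
}

static void irq_handler(void)
{
	bool submit;

	do {
		while (port[0] && port[0]->completed) {
			printf("retire %d\n", port[0]->id);
			port[0] = port[1];	/* shuffle down */
			port[1] = NULL;
		}
		submit = false;
		if (!port[1])
			submit = dequeue();
	} while (submit);
}

int main(void)
{
	irq_handler();			/* fills both ports: 1, 2 */
	port[0]->completed = true;
	irq_handler();			/* retires 1, submits 3 */
	return 0;
}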
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 83 ++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  3 --
 3 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index bab0c2fc3bce..601b8777d3fd 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
 	freespace -= gc->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		gc->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 
 	return ret;
 }
@@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	gc->wq_rsvd -= wqi_size;
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -658,6 +658,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	spin_unlock(&client->wq_lock);
 }
 
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			dma_fence_enable_sw_signaling(&last->fence);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		dma_fence_enable_sw_signaling(&last->fence);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			rq = port[1].request;
+			port[0].request = rq;
+			port[1].request = NULL;
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -1524,16 +1588,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
-		list_for_each_entry(request,
-				    &engine->timeline->requests, link) {
+		list_for_each_entry(request, &engine->timeline->requests, link)
 			client->wq_rsvd += sizeof(struct guc_wq_item);
-			if (i915_sw_fence_done(&request->submit))
-				i915_guc_submit(request);
-		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6d7505b5c5e7..217f63e17e4e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		tasklet_schedule(&engine->irq_tasklet);
 		notify_ring(engine);
+	}
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		tasklet_schedule(&engine->irq_tasklet);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 70ac74c959bd..d17c0fff9642 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1350,9 +1350,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
 	if (request->ctx != port[0].request->ctx) {
-- 
2.10.2


* [PATCH 10/12] drm/i915/scheduler: Support user-defined priorities
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (8 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-02 17:50 ` [PATCH 11/12] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
                   ` (2 subsequent siblings)
  12 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Use a priority stored in the context as the initial value when
submitting a request. This allows us to change the default priority on a
per-context basis, allowing different contexts to be favoured with GPU
time at the expense of lower importance work. The user can adjust the
context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
values being higher priority (they will be serviced earlier, after their
dependencies have been resolved). Any prerequisite work for an execbuf
will have its priority raised to match the new request as required.

Normal users can specify any value in the range of -1023 to 0 (0 being
the default), i.e. they can reduce the priority of their workloads (and
temporarily boost it back to normal if so desired).

Privileged users can specify any value in the range of -1023 to 1023
(0 being the default), i.e. they can raise their priority above all
others and so potentially starve the system.

Note that the existing schedulers are neither fair nor load balancing;
execution is strictly by priority on a first-come, first-served basis,
and the driver may choose to boost some requests above the range
available to users.

This priority was originally based around nice(2), but evolved to allow
clients to adjust their priority within a small range, and allow for a
privileged high priority range.

For example, this can be used to implement EGL_IMG_context_priority
https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt

	"EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
        the context to be created. This attribute is a hint, as an
        implementation may not support multiple contexts at some
        priority levels and system policy may limit access to high
        priority contexts to appropriate system privilege level. The
        default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
        EGL_CONTEXT_PRIORITY_MEDIUM_IMG."

so we can map

	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
	PRIORITY_MED -> 0 [default]
	PRIORITY_LOW -> -1023

They also map onto the priorities used by VkQueue (and a VkQueue is
essentially a timeline, our i915_gem_context under full-ppgtt).

Testcase: igt/gem_exec_schedule
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem_context.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 include/uapi/drm/i915_drm.h             |  3 +++
 4 files changed, 26 insertions(+), 1 deletion(-)
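
Aside, not part of the patch: a hypothetical userspace sketch of the
new param (error handling elided; everything except
I915_CONTEXT_PARAM_PRIORITY is the existing context setparam uAPI):

	#include <string.h>
	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	/* Demote (or, with CAP_SYS_ADMIN, promote) all batches submitted
	 * with this context; takes effect for subsequent execbufs.
	 */
	static int set_context_priority(int fd, __u32 ctx_id, int prio)
	{
		struct drm_i915_gem_context_param p;

		memset(&p, 0, sizeof(p)); /* size must be 0 for this param */
		p.ctx_id = ctx_id;
		p.param = I915_CONTEXT_PARAM_PRIORITY;
		p.value = prio; /* -1023..0 for normal users */

		return ioctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
	}

e.g. set_context_priority(fd, ctx, -512) for a background context that
should only run when nothing more important is queued.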

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 738ec44fe6af..7431c72beccc 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -948,6 +948,7 @@ struct i915_gem_context {
 	/* Unique identifier for this context, used by the hw for tracking */
 	unsigned int hw_id;
 	u32 user_handle;
+	int priority; /* greater priorities are serviced first */
 
 	u32 ggtt_alignment;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 6dd475735f0a..48b5aacf5fc2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
 		return PTR_ERR(ctx);
 	}
 
+	ctx->priority = -I915_PRIORITY_MAX; /* lowest priority; idle task */
 	dev_priv->kernel_context = ctx;
 
 	DRM_DEBUG_DRIVER("%s context support initialized\n",
@@ -1100,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
+	case I915_CONTEXT_PARAM_PRIORITY:
+		args->value = ctx->priority;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1155,6 +1159,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
 		}
 		break;
+
+	case I915_CONTEXT_PARAM_PRIORITY:
+		{
+			int priority = args->value;
+
+			if (args->size)
+				ret = -EINVAL;
+			else if (priority >= I915_PRIORITY_MAX ||
+				 priority <= -I915_PRIORITY_MAX)
+				ret = -EINVAL;
+			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
+				ret = -EPERM;
+			else
+				ctx->priority = priority;
+		}
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 5f0068fac3e9..0b7b61e9f2e4 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -844,7 +844,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * run at the earliest possible convenience.
 	 */
 	if (engine->schedule)
-		engine->schedule(request, 0);
+		engine->schedule(request, request->ctx->priority);
 
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1c12a350eca3..47901a8ad682 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -391,6 +391,8 @@ typedef struct drm_i915_irq_wait {
 
 /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
  * priorities and the driver will attempt to execute batches in priority order.
+ * The initial priority for each batch is supplied by the context and is
+ * controlled via I915_CONTEXT_PARAM_PRIORITY.
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
@@ -1224,6 +1226,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_PRIORITY	0x5
 	__u64 value;
 };
 
-- 
2.10.2


* [PATCH 11/12] drm/i915: Enable userspace to opt-out of implicit fencing
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (9 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 10/12] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-02 17:50 ` [PATCH 12/12] drm/i915: Support explicit fencing for execbuf Chris Wilson
  2016-11-02 18:45 ` ✓ Fi.CI.BAT: success for series starting with [01/12] drm/i915: Split request submit/execute phase into two Patchwork
  12 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we always must wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is either using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). It follows that
userspace needs a mechanism to avoid the kernel's serialisation on its
implicit fences before execbuf execution.

The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag needs to be set
only once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).

Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.

Serious caveat: write ordering is not strictly correct after setting
this flag on a render target on multiple engines. This affects all
subsequent GEM operations (execbuf, set-domain, pread) and shared
dma-buf operations. A fix is possible - but costly (both in terms of
further ABI changes and runtime overhead).

Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)
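
Aside, not part of the patch: a hypothetical sketch of how userspace
might use the new flag (only EXEC_OBJECT_ASYNC is new uABI here; the
execbuf setup around it is elided):

	#include <string.h>
	#include <drm/i915_drm.h>

	static void exec_object_async(struct drm_i915_gem_exec_object2 *obj,
				      __u32 handle)
	{
		memset(obj, 0, sizeof(*obj));
		obj->handle = handle;
		/* Skip the implicit-fence wait for this object only; the
		 * caller is responsible for ordering access to it with
		 * explicit fences.
		 */
		obj->flags = EXEC_OBJECT_ASYNC;
	}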

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 5a0e885e6104..4bfd5a582aa2 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -343,6 +343,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
+	case I915_PARAM_HAS_EXEC_ASYNC:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c35e847bb8bc..0ae2feb24632 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1110,6 +1110,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
+			continue;
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 47901a8ad682..4bd83c0b07db 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -396,6 +396,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 42
+
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -736,8 +742,29 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
 	__u64 flags;
 
 	union {
-- 
2.10.2


* [PATCH 12/12] drm/i915: Support explicit fencing for execbuf
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (10 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 11/12] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2016-11-02 17:50 ` Chris Wilson
  2016-11-02 18:45 ` ✓ Fi.CI.BAT: success for series starting with [01/12] drm/i915: Split request submit/execute phase into two Patchwork
  12 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-02 17:50 UTC (permalink / raw)
  To: intel-gfx

Now that the user can opt-out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.

Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/Kconfig               |  1 +
 drivers/gpu/drm/i915/i915_drv.c            |  3 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h                | 35 ++++++++++++++++++-
 4 files changed, 86 insertions(+), 7 deletions(-)
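
Aside, not part of the patch: a hypothetical sketch of the fence
plumbing from userspace (execbuf setup elided; note the _WR ioctl so
that the out-fence fd is written back):

	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	static int execbuf_fenced(int fd, struct drm_i915_gem_execbuffer2 *eb,
				  int in_fence, int *out_fence)
	{
		eb->flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
		eb->rsvd2 = (__u32)in_fence; /* lower 32 bits: in-fence fd */

		if (ioctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, eb))
			return -1; /* errno holds the error */

		*out_fence = eb->rsvd2 >> 32; /* upper 32 bits: out-fence fd */
		return 0;
	}

The returned fd can be merged with other sync_files, handed to another
execbuf as an in-fence, or passed to atomic KMS, and must be closed by
the caller when done.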

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index df96aed6975a..8e93b61bd8c3 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -19,6 +19,7 @@ config DRM_I915
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
 	select ACPI_BUTTON if ACPI
+	select SYNC_FILE
 	help
 	  Choose this option if you have a system that has "Intel Graphics
 	  Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 4bfd5a582aa2..d022fb207a42 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -344,6 +344,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
 	case I915_PARAM_HAS_EXEC_ASYNC:
+	case I915_PARAM_HAS_EXEC_FENCE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
@@ -2534,7 +2535,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
-	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0ae2feb24632..83533fb6cf46 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
 
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/sync_file.h>
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
@@ -1588,6 +1589,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct i915_execbuffer_params *params = &params_master;
 	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 dispatch_flags;
+	struct dma_fence *in_fence = NULL;
+	struct sync_file *out_fence = NULL;
+	int out_fence_fd = -1;
 	int ret;
 	bool need_relocs;
 
@@ -1631,6 +1635,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_IN) {
+		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!in_fence) {
+			ret = -EINVAL;
+			goto pre_mutex_err;
+		}
+	}
+
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0) {
+			ret = out_fence_fd;
+			out_fence_fd = -1;
+			goto pre_mutex_err;
+		}
+	}
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1775,6 +1796,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err_batch_unpin;
 	}
 
+	if (in_fence) {
+		ret = i915_gem_request_await_dma_fence(params->request,
+						       in_fence);
+		if (ret < 0)
+			goto err_request;
+	}
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(&params->request->fence);
+		if (!out_fence) {
+			ret = -ENOMEM;
+			goto err_request;
+		}
+	}
+
 	/* Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the batch_obj be moved onto the
@@ -1802,6 +1838,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	ret = execbuf_submit(params, args, &eb->vmas);
 err_request:
 	__i915_add_request(params->request, ret == 0);
+	if (out_fence) {
+		if (ret == 0) {
+			fd_install(out_fence_fd, out_fence->file);
+			args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
+			args->rsvd2 |= (u64)out_fence_fd << 32;
+			out_fence_fd = -1;
+		} else {
+			fput(out_fence->file);
+		}
+	}
 
 err_batch_unpin:
 	/*
@@ -1823,6 +1869,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+	if (out_fence_fd != -1)
+		put_unused_fd(out_fence_fd);
+	dma_fence_put(in_fence);
 	return ret;
 }
 
@@ -1930,11 +1979,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = drm_malloc_gfp(args->buffer_count,
 				    sizeof(*exec2_list),
 				    GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4bd83c0b07db..90082269fb50 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
 #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
 #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
 #define DRM_I915_GEM_WAIT	0x2c
@@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -401,6 +403,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_ASYNC	 42
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fencing -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 43
 
 typedef struct drm_i915_getparam {
 	__s32 param;
@@ -854,7 +862,32 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be closed after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file fds and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.10.2


* ✓ Fi.CI.BAT: success for series starting with [01/12] drm/i915: Split request submit/execute phase into two
  2016-11-02 17:50 Trivial priority scheduler Chris Wilson
                   ` (11 preceding siblings ...)
  2016-11-02 17:50 ` [PATCH 12/12] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-11-02 18:45 ` Patchwork
  12 siblings, 0 replies; 40+ messages in thread
From: Patchwork @ 2016-11-02 18:45 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [01/12] drm/i915: Split request submit/execute phase into two
URL   : https://patchwork.freedesktop.org/series/14751/
State : success

== Summary ==

Series 14751v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/14751/revisions/1/mbox/


fi-bdw-5557u     total:241  pass:226  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:241  pass:201  dwarn:0   dfail:0   fail:0   skip:40 
fi-byt-j1900     total:241  pass:213  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:241  pass:209  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:241  pass:221  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:241  pass:220  dwarn:0   dfail:0   fail:0   skip:21 
fi-ilk-650       total:241  pass:187  dwarn:0   dfail:0   fail:0   skip:54 
fi-ivb-3520m     total:241  pass:218  dwarn:0   dfail:0   fail:0   skip:23 
fi-ivb-3770      total:241  pass:218  dwarn:0   dfail:0   fail:0   skip:23 
fi-kbl-7200u     total:241  pass:219  dwarn:0   dfail:0   fail:0   skip:22 
fi-skl-6260u     total:241  pass:227  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:241  pass:220  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6700k     total:241  pass:219  dwarn:1   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:241  pass:227  dwarn:0   dfail:0   fail:0   skip:14 
fi-snb-2520m     total:241  pass:208  dwarn:0   dfail:0   fail:0   skip:33 
fi-snb-2600      total:241  pass:207  dwarn:0   dfail:0   fail:0   skip:34 

bf6b989af8b0fde56a352d9005c97b2d8e3bbbe3 drm-intel-nightly: 2016y-11m-02d-15h-44m-03s UTC integration manifest
164f390 drm/i915: Support explicit fencing for execbuf
9811cdb drm/i915: Enable userspace to opt-out of implicit fencing
4d35615 drm/i915/scheduler: Support user-defined priorities
675e7ea HACK drm/i915/scheduler: emulate a scheduler for guc
224c9f8 drm/i915/guc: Cache the client mapping
b1c5a70 drm/i915/scheduler: Boost priorities for flips
2ad0f8b drm/i915/scheduler: Execute requests in order of priorities
f2881be drm/i915/scheduler: Record all dependencies upon request construction
c822ff5 drm/i915/scheduler: Signal the arrival of a new request
b707ee3 drm/i915: Remove engine->execlist_lock
9943660 drm/i915: Defer transfer onto execution timeline to actual hw submission
332bc0d drm/i915: Split request submit/execute phase into two

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_2894/

* Re: [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-02 17:50 ` [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
@ 2016-11-03 10:34   ` Tvrtko Ursulin
  2016-11-03 10:51     ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 10:34 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> Defer the transfer from the client's timeline onto the execution
> timeline from the point of readiness to the point of actual submission.
> For example, in execlists, a request is finally submitted to hardware
> when the hardware is ready, and only put onto the hardware queue when
> the request is ready. By deferring the transfer, we ensure that the
> timeline is maintained in retirement order if we decide to queue the
> requests onto the hardware in a different order than fifo.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c    | 36 ++++++++++++++++++------------
>  drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
>  drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
>  drivers/gpu/drm/i915/intel_lrc.c           |  5 +++++
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
>  5 files changed, 33 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 1ae5a2f8953f..05544dec5de9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
>  	return atomic_inc_return(&tl->next_seqno);
>  }
>
> -static int __i915_sw_fence_call
> -submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_gem_request *request =
> -		container_of(fence, typeof(*request), submit);
>  	struct intel_engine_cs *engine = request->engine;
>  	struct intel_timeline *timeline;
> -	unsigned long flags;
>  	u32 seqno;
>
> -	if (state != FENCE_COMPLETE)
> -		return NOTIFY_DONE;
> -
>  	/* Transfer from per-context onto the global per-engine timeline */
>  	timeline = engine->timeline;
>  	GEM_BUG_ON(timeline == request->timeline);
> -
> -	/* Will be called from irq-context when using foreign DMA fences */
> -	spin_lock_irqsave(&timeline->lock, flags);
> +	assert_spin_locked(&timeline->lock);
>
>  	seqno = timeline_get_seqno(timeline->common);
>  	GEM_BUG_ON(!seqno);
> @@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  	GEM_BUG_ON(!request->global_seqno);
>  	engine->emit_breadcrumb(request,
>  				request->ring->vaddr + request->postfix);
> -	engine->submit_request(request);
>
> -	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
> +	/* nested from submit_notify */
> +	spin_lock_nested(&request->timeline->lock, 2);

Is this because lockdep does not differentiate between 
request->timeline->lock and engine->timeline->lock?

Could we pass different lock classes to i915_gem_timeline_init and
remove this deeply nested annotation? That would be better, I think.

>  	list_move_tail(&request->link, &timeline->requests);
>  	spin_unlock(&request->timeline->lock);
>
>  	i915_sw_fence_commit(&request->execute);
> +}
>
> -	spin_unlock_irqrestore(&timeline->lock, flags);
> +static int __i915_sw_fence_call
> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +	if (state == FENCE_COMPLETE) {
> +		struct drm_i915_gem_request *request =
> +			container_of(fence, typeof(*request), submit);
> +		struct intel_engine_cs *engine = request->engine;
> +		unsigned long flags;
> +
> +		/* Will be called from irq-context when using foreign fences,
> +		 * and may be called from the execution callback of another
> +		 * engine.
> +		 */
> +		spin_lock_irqsave_nested(&engine->timeline->lock, flags, 1);
> +		engine->submit_request(request);
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +	}
>
>  	return NOTIFY_DONE;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index ed13f37fea0f..725d04128eaf 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -228,6 +228,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
>  #define i915_add_request_no_flush(req) \
>  	__i915_add_request(req, false)
>
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request);
> +
>  struct intel_rps_client;
>  #define NO_WAITBOOST ERR_PTR(-1)
>  #define IS_RPS_CLIENT(p) (!IS_ERR(p))
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 857ef914cae7..ddb574d5ceda 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -640,6 +640,8 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	__i915_gem_request_submit(rq);
> +
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index fa3012c342cc..b9aba16851ac 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *cursor, *last;
>  	struct execlist_port *port = engine->execlist_port;
> +	unsigned long flags;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 * and context switches) submission.
>  	 */
>
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>  	spin_lock(&engine->execlist_lock);
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
>  		/* Can we combine this request with the current port? It has to
> @@ -501,6 +503,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			i915_gem_request_assign(&port->request, last);
>  			port++;
>  		}
> +
> +		__i915_gem_request_submit(cursor);
>  		last = cursor;
>  		submit = true;
>  	}
> @@ -512,6 +516,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		i915_gem_request_assign(&port->request, last);
>  	}
>  	spin_unlock(&engine->execlist_lock);
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>  	if (submit)
>  		execlists_submit_ports(engine);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 700e93d80616..f91ee24e2763 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct drm_i915_private *dev_priv = request->i915;
>
> +	__i915_gem_request_submit(request);
> +
>  	I915_WRITE_TAIL(request->engine, request->tail);
>  }
>
>

Otherwise looking good to me.

Regards,

Tvrtko

* Re: [PATCH 01/12] drm/i915: Split request submit/execute phase into two
  2016-11-02 17:50 ` [PATCH 01/12] drm/i915: Split request submit/execute phase into two Chris Wilson
@ 2016-11-03 10:35   ` Tvrtko Ursulin
  0 siblings, 0 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 10:35 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> In order to support deferred scheduling, we need to differentiate
> between when the request is ready to run (i.e. the submit fence is
> signaled) and when the request is actually run (a new execute fence).
> This is typically split between the request itself wanting to wait upon
> others (for which we use the submit fence) and the CPU wanting to wait
> upon the request, for which we use the execute fence to be sure the
> hardware is ready to signal completion.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c | 33 ++++++++++++++++++++++++---------
>  drivers/gpu/drm/i915/i915_gem_request.h |  2 ++
>  2 files changed, 26 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 79b0046d9a57..1ae5a2f8953f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -351,11 +351,19 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  	list_move_tail(&request->link, &timeline->requests);
>  	spin_unlock(&request->timeline->lock);
>
> +	i915_sw_fence_commit(&request->execute);
> +
>  	spin_unlock_irqrestore(&timeline->lock, flags);
>
>  	return NOTIFY_DONE;
>  }
>
> +static int __i915_sw_fence_call
> +execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +	return NOTIFY_DONE;
> +}
> +
>  /**
>   * i915_gem_request_alloc - allocate a request structure
>   *
> @@ -441,6 +449,12 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>  		       __timeline_get_seqno(req->timeline->common));
>
>  	i915_sw_fence_init(&req->submit, submit_notify);
> +	i915_sw_fence_init(&req->execute, execute_notify);
> +	/* Ensure that the execute fence completes after the submit fence -
> +	 * as we complete the execute fence from within the submit fence
> +	 * callback, its completion would otherwise be visible first.
> +	 */
> +	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
>
>  	INIT_LIST_HEAD(&req->active_list);
>  	req->i915 = dev_priv;
> @@ -817,9 +831,9 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
>  }
>
>  static long
> -__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
> -			       unsigned int flags,
> -			       long timeout)
> +__i915_request_wait_for_execute(struct drm_i915_gem_request *request,
> +				unsigned int flags,
> +				long timeout)
>  {
>  	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
>  		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
> @@ -831,9 +845,9 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
>  		add_wait_queue(q, &reset);
>
>  	do {
> -		prepare_to_wait(&request->submit.wait, &wait, state);
> +		prepare_to_wait(&request->execute.wait, &wait, state);
>
> -		if (i915_sw_fence_done(&request->submit))
> +		if (i915_sw_fence_done(&request->execute))
>  			break;
>
>  		if (flags & I915_WAIT_LOCKED &&
> @@ -851,7 +865,7 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
>
>  		timeout = io_schedule_timeout(timeout);
>  	} while (timeout);
> -	finish_wait(&request->submit.wait, &wait);
> +	finish_wait(&request->execute.wait, &wait);
>
>  	if (flags & I915_WAIT_LOCKED)
>  		remove_wait_queue(q, &reset);
> @@ -903,13 +917,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
>
>  	trace_i915_gem_request_wait_begin(req);
>
> -	if (!i915_sw_fence_done(&req->submit)) {
> -		timeout = __i915_request_wait_for_submit(req, flags, timeout);
> +	if (!i915_sw_fence_done(&req->execute)) {
> +		timeout = __i915_request_wait_for_execute(req, flags, timeout);
>  		if (timeout < 0)
>  			goto complete;
>
> -		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
> +		GEM_BUG_ON(!i915_sw_fence_done(&req->execute));
>  	}
> +	GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
>  	GEM_BUG_ON(!req->global_seqno);
>
>  	/* Optimistic short spin before touching IRQs */
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 75f8360b3421..ed13f37fea0f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -85,7 +85,9 @@ struct drm_i915_gem_request {
>  	struct intel_signal_node signaling;
>
>  	struct i915_sw_fence submit;
> +	struct i915_sw_fence execute;
>  	wait_queue_t submitq;
> +	wait_queue_t execq;
>
>  	u32 global_seqno;
>
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 03/12] drm/i915: Remove engine->execlist_lock
  2016-11-02 17:50 ` [PATCH 03/12] drm/i915: Remove engine->execlist_lock Chris Wilson
@ 2016-11-03 10:47   ` Tvrtko Ursulin
  2016-11-03 12:28     ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 10:47 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> The execlist_lock is now completely subsumed by the engine->timeline->lock,
> and so we can remove the redundant layer of locking.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c     | 4 ++--
>  drivers/gpu/drm/i915/i915_gem.c         | 4 ++--
>  drivers/gpu/drm/i915/intel_engine_cs.c  | 1 -
>  drivers/gpu/drm/i915/intel_lrc.c        | 7 +------
>  drivers/gpu/drm/i915/intel_ringbuffer.h | 1 -
>  5 files changed, 5 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index c9465fbff2df..3cb96d260dfb 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -3256,11 +3256,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>  				seq_printf(m, "\t\tELSP[1] idle\n");
>  			rcu_read_unlock();
>
> -			spin_lock_irq(&engine->execlist_lock);

Hm, why did we have this as _irq and not _bh already?

> +			spin_lock_irq(&engine->timeline->lock);
>  			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
>  				print_request(m, rq, "\t\tQ ");
>  			}
> -			spin_unlock_irq(&engine->execlist_lock);
> +			spin_unlock_irq(&engine->timeline->lock);
>  		} else if (INTEL_GEN(dev_priv) > 6) {
>  			seq_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
>  				   I915_READ(RING_PP_DIR_BASE(engine)));
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 5839bebba64a..cf28666021f8 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2686,12 +2686,12 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>  	 */
>
>  	if (i915.enable_execlists) {
> -		spin_lock(&engine->execlist_lock);

Likewise should have been _bh here, never mind now. :)

> +		spin_lock_irq(&engine->timeline->lock);
>  		INIT_LIST_HEAD(&engine->execlist_queue);
>  		i915_gem_request_put(engine->execlist_port[0].request);
>  		i915_gem_request_put(engine->execlist_port[1].request);
>  		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> -		spin_unlock(&engine->execlist_lock);
> +		spin_unlock_irq(&engine->timeline->lock);
>  	}
>  }
>
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 841f8d1e1410..298f0f95dd3f 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -237,7 +237,6 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
>  void intel_engine_setup_common(struct intel_engine_cs *engine)
>  {
>  	INIT_LIST_HEAD(&engine->execlist_queue);
> -	spin_lock_init(&engine->execlist_lock);
>
>  	intel_engine_init_timeline(engine);
>  	intel_engine_init_hangcheck(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index b9aba16851ac..186c3ccb2c34 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -471,7 +471,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 */
>
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
> -	spin_lock(&engine->execlist_lock);
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
>  		/* Can we combine this request with the current port? It has to
>  		 * be the same context/ringbuffer and not have any exceptions
> @@ -515,7 +514,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>
>  		i915_gem_request_assign(&port->request, last);
>  	}
> -	spin_unlock(&engine->execlist_lock);
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>  	if (submit)
> @@ -600,9 +598,8 @@ static void intel_lrc_irq_handler(unsigned long data)
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> -	unsigned long flags;
>
> -	spin_lock_irqsave(&engine->execlist_lock, flags);

Again I am confused why this wasn't just _bh.

> +	assert_spin_locked(&engine->timeline->lock);
>
>  	/* We keep the previous context alive until we retire the following
>  	 * request. This ensures that any the context object is still pinned
> @@ -616,8 +613,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
> -
> -	spin_unlock_irqrestore(&engine->execlist_lock, flags);
>  }
>
>  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 642b54692d0d..062bc8e1872a 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -335,7 +335,6 @@ struct intel_engine_cs {
>
>  	/* Execlists */
>  	struct tasklet_struct irq_tasklet;
> -	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
>  	struct execlist_port {
>  		struct drm_i915_gem_request *request;
>  		unsigned int count;
>

Anyway, looks correct.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request
  2016-11-02 17:50 ` [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
@ 2016-11-03 10:49   ` Tvrtko Ursulin
  0 siblings, 0 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 10:49 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> The start of the scheduler, add a hook into request submission for the
> scheduler to see the arrival of new requests and prepare its runqueues.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.c         |  4 ++++
>  drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++++++
>  drivers/gpu/drm/i915/intel_engine_cs.c  |  3 +++
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  9 +++++++++
>  include/uapi/drm/i915_drm.h             |  5 +++++
>  5 files changed, 34 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 79cea49183b3..5a0e885e6104 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -323,6 +323,10 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  		 */
>  		value = i915_gem_mmap_gtt_version();
>  		break;
> +	case I915_PARAM_HAS_SCHEDULER:
> +		value = dev_priv->engine[RCS] &&
> +			dev_priv->engine[RCS]->schedule;
> +		break;
>  	case I915_PARAM_MMAP_VERSION:
>  		/* Remember to bump this if the version changes! */
>  	case I915_PARAM_HAS_GEM:
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 05544dec5de9..9c8605c834f9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -759,6 +759,19 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>
>  	i915_gem_mark_busy(engine);
>
> +	/* Let the backend know a new request has arrived that may need
> +	 * to adjust the existing execution schedule due to a high priority
> +	 * request - i.e. we may want to preempt the current request in order
> +	 * to run a high priority dependency chain *before* we can execute this
> +	 * request.
> +	 *
> +	 * This is called before the request is ready to run so that we can
> +	 * decide whether to preempt the entire chain so that it is ready to
> +	 * run at the earliest possible convenience.
> +	 */
> +	if (engine->schedule)
> +		engine->schedule(request, 0);
> +
>  	local_bh_disable();
>  	i915_sw_fence_commit(&request->submit);
>  	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index 298f0f95dd3f..c9171a058478 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -102,6 +102,9 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
>  	engine->mmio_base = info->mmio_base;
>  	engine->irq_shift = info->irq_shift;
>
> +	/* Nothing to do here, execute in order of dependencies */
> +	engine->schedule = NULL;
> +
>  	dev_priv->engine[id] = engine;
>  	return 0;
>  }
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 062bc8e1872a..75991a3c694b 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -267,6 +267,15 @@ struct intel_engine_cs {
>  	 */
>  	void		(*submit_request)(struct drm_i915_gem_request *req);
>
> +	/* Call when the priority on a request has changed and it and its
> +	 * dependencies may need rescheduling. Note the request itself may
> +	 * not be ready to run!
> +	 *
> +	 * Called under the struct_mutex.
> +	 */
> +	void		(*schedule)(struct drm_i915_gem_request *request,
> +				    int priority);
> +
>  	/* Some chipsets are not quite as coherent as advertised and need
>  	 * an expensive kick to force a true read of the up-to-date seqno.
>  	 * However, the up-to-date seqno is not always required and the last
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 03725fe89859..1c12a350eca3 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -389,6 +389,11 @@ typedef struct drm_i915_irq_wait {
>  #define I915_PARAM_MIN_EU_IN_POOL	 39
>  #define I915_PARAM_MMAP_GTT_VERSION	 40
>
> +/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
> + * priorities and the driver will attempt to execute batches in priority order.
> + */
> +#define I915_PARAM_HAS_SCHEDULER	 41
> +
>  typedef struct drm_i915_getparam {
>  	__s32 param;
>  	/*
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-03 10:34   ` Tvrtko Ursulin
@ 2016-11-03 10:51     ` Chris Wilson
  2016-11-03 11:27       ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 10:51 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 10:34:26AM +0000, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >Defer the transfer from the client's timeline onto the execution
> >timeline from the point of readiness to the point of actual submission.
> >For example, in execlists, a request is finally submitted to hardware
> >when the hardware is ready, and only put onto the hardware queue when
> >the request is ready. By deferring the transfer, we ensure that the
> >timeline is maintained in retirement order if we decide to queue the
> >requests onto the hardware in a different order than fifo.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_gem_request.c    | 36 ++++++++++++++++++------------
> > drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
> > drivers/gpu/drm/i915/i915_guc_submission.c |  2 ++
> > drivers/gpu/drm/i915/intel_lrc.c           |  5 +++++
> > drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
> > 5 files changed, 33 insertions(+), 14 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index 1ae5a2f8953f..05544dec5de9 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
> > 	return atomic_inc_return(&tl->next_seqno);
> > }
> >
> >-static int __i915_sw_fence_call
> >-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
> > {
> >-	struct drm_i915_gem_request *request =
> >-		container_of(fence, typeof(*request), submit);
> > 	struct intel_engine_cs *engine = request->engine;
> > 	struct intel_timeline *timeline;
> >-	unsigned long flags;
> > 	u32 seqno;
> >
> >-	if (state != FENCE_COMPLETE)
> >-		return NOTIFY_DONE;
> >-
> > 	/* Transfer from per-context onto the global per-engine timeline */
> > 	timeline = engine->timeline;
> > 	GEM_BUG_ON(timeline == request->timeline);
> >-
> >-	/* Will be called from irq-context when using foreign DMA fences */
> >-	spin_lock_irqsave(&timeline->lock, flags);
> >+	assert_spin_locked(&timeline->lock);
> >
> > 	seqno = timeline_get_seqno(timeline->common);
> > 	GEM_BUG_ON(!seqno);
> >@@ -345,15 +336,32 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> > 	GEM_BUG_ON(!request->global_seqno);
> > 	engine->emit_breadcrumb(request,
> > 				request->ring->vaddr + request->postfix);
> >-	engine->submit_request(request);
> >
> >-	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
> >+	/* nested from submit_notify */
> >+	spin_lock_nested(&request->timeline->lock, 2);
> 
> Is this because lockdep does not differentiate between
> request->timeline->lock and engine->timeline->lock?

Yes. Worse is that the 2 comes from having different paths to this point
with their own nesting pattern.

struct timeline {
	struct lock_class_key lockdep;
	struct mutex mutex;
}

__mutex_init(&timeline->mutex, "timeline", &timeline->lockdep);

? (Hopefully that works w/o lockdep enabled)

Better, then, would be

i915_gem_timeline_create()
i915_gem_timeline_create_global()

and just use one lock class for all user timelines, and a lock class per
engine on the global timeline.
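
Something like this, as a very rough sketch (the create functions as
named above; everything else here is hypothetical):

int __i915_gem_timeline_init(struct drm_i915_private *i915,
			     struct i915_gem_timeline *tl,
			     const char *name,
			     struct lock_class_key *key)
{
	__mutex_init(&tl->mutex, name, key);
	return 0;
}

/* every user timeline shares this one lock class */
#define i915_gem_timeline_create(i915, tl, name) ({ \
	static struct lock_class_key __key; \
	__i915_gem_timeline_init((i915), (tl), (name), &__key); \
})

with i915_gem_timeline_create_global() doing the same with its own key
(or an array of keys, one per engine).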

Will try.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-02 17:50 ` [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
@ 2016-11-03 11:03   ` Tvrtko Ursulin
  2016-11-03 11:55     ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 11:03 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> The scheduler needs to know the dependencies of each request for the
> lifetime of the request, as it may choose to reschedule the requests at
> any time and must ensure the dependency tree is not broken. This is in
> addition to using the fence to only allow execution after all
> dependencies have been completed.
>
> One option was to extend the fence to support the bidirectional
> dependency tracking required by the scheduler. However the mismatch in
> lifetimes between the submit fence and the request essentially meant
> that we had to build a completely separate struct (and we could not
> simply reuse the existing waitqueue in the fence for one half of the
> dependency tracking). The extra dependency tracking simply did not mesh
> well with the fence, and keeping it separate both keeps the fence
> implementation simpler and allows us to extend the dependency tracking
> into a priority tree (whilst maintaining support for reordering the
> tree).
>
> To avoid the additional allocations and list manipulations, the use of
> the priotree is disabled when there are no schedulers to use it.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c | 72 ++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_request.h | 23 +++++++++++
>  2 files changed, 94 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 9c8605c834f9..13090f226203 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -113,6 +113,59 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>
> +static int
> +i915_priotree_add_dependency(struct i915_priotree *pt,
> +			     struct i915_priotree *signal,
> +			     struct i915_dependency *dep)
> +{
> +	unsigned long flags = 0;
> +
> +	if (!dep) {
> +		dep = kmalloc(sizeof(*dep), GFP_KERNEL);

I will mention a dedicated cache again since this could possibly be our 
hottest allocation path. With a dedicated slab I've seen it grow to 5-7k 
objects in some benchmarks, with the request slab around 1k at the same 
time.

> +		if (!dep)
> +			return -ENOMEM;
> +
> +		flags |= I915_DEPENDENCY_ALLOC;
> +	}

Not sure if it would be any nicer to just set flags to 
I915_DEPENDENCY_ALLOC after allocating, and add an else path setting it 
to zero here.

> +
> +	dep->signal = signal;
> +	dep->flags = flags;
> +
> +	list_add(&dep->post_link, &signal->post_list);
> +	list_add(&dep->pre_link, &pt->pre_list);
> +	return 0;
> +}
> +
> +static void
> +i915_priotree_fini(struct i915_priotree *pt)
> +{
> +	struct i915_dependency *dep, *next;
> +
> +	/* Everyone we depended upon should retire before us and remove
> +	 * themselves from our list. However, retirement is run independently
> +	 * on each timeline and so we may be called out-of-order.
> +	 */
> +	list_for_each_entry_safe(dep, next, &pt->pre_list, pre_link) {
> +		list_del(&dep->post_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			kfree(dep);
> +	}
> +
> +	/* Remove ourselves from everyone who depends upon us */
> +	list_for_each_entry_safe(dep, next, &pt->post_list, post_link) {
> +		list_del(&dep->pre_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			kfree(dep);
> +	}
> +}
> +
> +static void
> +i915_priotree_init(struct i915_priotree *pt)
> +{
> +	INIT_LIST_HEAD(&pt->pre_list);
> +	INIT_LIST_HEAD(&pt->post_list);
> +}
> +
>  void i915_gem_retire_noop(struct i915_gem_active *active,
>  			  struct drm_i915_gem_request *request)
>  {
> @@ -182,6 +235,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  	i915_gem_context_put(request->ctx);
>
>  	dma_fence_signal(&request->fence);
> +
> +	i915_priotree_fini(&request->priotree);
>  	i915_gem_request_put(request);
>  }
>
> @@ -464,6 +519,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>  	 */
>  	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
>
> +	i915_priotree_init(&req->priotree);
> +
>  	INIT_LIST_HEAD(&req->active_list);
>  	req->i915 = dev_priv;
>  	req->engine = engine;
> @@ -517,6 +574,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
>
>  	GEM_BUG_ON(to == from);
>
> +	if (to->engine->schedule) {
> +		ret = i915_priotree_add_dependency(&to->priotree,
> +						   &from->priotree,
> +						   NULL);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	if (to->timeline == from->timeline)
>  		return 0;
>
> @@ -740,9 +805,14 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>
>  	prev = i915_gem_active_raw(&timeline->last_request,
>  				   &request->i915->drm.struct_mutex);
> -	if (prev)
> +	if (prev) {
>  		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
>  					     &request->submitq);
> +		if (engine->schedule)
> +			i915_priotree_add_dependency(&request->priotree,
> +						     &prev->priotree,
> +						     &request->depq);
> +	}
>
>  	spin_lock_irq(&timeline->lock);
>  	list_add_tail(&request->link, &timeline->requests);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 725d04128eaf..0cb2ba546062 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -41,6 +41,18 @@ struct intel_signal_node {
>  	struct intel_wait wait;
>  };
>
> +struct i915_dependency {
> +	struct i915_priotree *signal;
> +	struct list_head pre_link, post_link;
> +	unsigned long flags;
> +#define I915_DEPENDENCY_ALLOC BIT(0)
> +};
> +
> +struct i915_priotree {
> +	struct list_head pre_list; /* who is before us, we depend upon */
> +	struct list_head post_list; /* who is after us, they depend upon us */
> +};

I need a picture to imagine this data structure. :(

> +
>  /**
>   * Request queue structure.
>   *
> @@ -89,6 +101,17 @@ struct drm_i915_gem_request {
>  	wait_queue_t submitq;
>  	wait_queue_t execq;
>
> +	/* A list of everyone we wait upon, and everyone who waits upon us.
> +	 * Even though we will not be submitted to the hardware before the
> +	 * submit fence is signaled (it waits for all external events as well
> +	 * as our own requests), the scheduler still needs to know the
> +	 * dependency tree for the lifetime of the request (from execbuf
> +	 * to retirement), i.e. bidirectional dependency information for the
> +	 * request not tied to individual fences.
> +	 */
> +	struct i915_priotree priotree;
> +	struct i915_dependency depq;

Why depq and not dep_node or something? Makes me wonder what I am 
missing about the "q". It is not a queue?!?

> +
>  	u32 global_seqno;
>
>  	/** GEM sequence number associated with the previous request,
>

Regards,

Tvrtko

* Re: [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-03 10:51     ` Chris Wilson
@ 2016-11-03 11:27       ` Chris Wilson
  0 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 11:27 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

On Thu, Nov 03, 2016 at 10:51:14AM +0000, Chris Wilson wrote:
> Yes. Worse is that the 2 comes from having different paths to this point
> with their own nesting pattern.

Not any more, that was a leftover from one version that managed to nest
signaling/execution.

The issue is 

	__i915_gem_request_submit -> fence_signal(rq->execute)

Then if we hook the submit fence onto an earlier execute fence, we hit
submit_notify() and try to acquire the engine->timeline lock again.

Hindsight says we cannot control the recursion there, so don't hook up
the fences like that (submit is not allowed to listen to execute).
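
Sketched out, the forbidden hook-up is:

	submit(A) --signals--> execute(A)
	                            \
	                             +--awaited by--> submit(B)

i.e. signaling execute(A) from inside submit_notify(A), which already
holds engine->timeline->lock, immediately runs submit_notify(B) and
tries to take the same lock again.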
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-03 11:03   ` Tvrtko Ursulin
@ 2016-11-03 11:55     ` Chris Wilson
  2016-11-04 14:44       ` Tvrtko Ursulin
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 11:55 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >The scheduler needs to know the dependencies of each request for the
> >lifetime of the request, as it may choose to reschedule the requests at
> >any time and must ensure the dependency tree is not broken. This is in
> >addition to using the fence to only allow execution after all
> >dependencies have been completed.
> >
> >One option was to extend the fence to support the bidirectional
> >dependency tracking required by the scheduler. However the mismatch in
> >lifetimes between the submit fence and the request essentially meant
> >that we had to build a completely separate struct (and we could not
> >simply reuse the existing waitqueue in the fence for one half of the
> >dependency tracking). The extra dependency tracking simply did not mesh
> >well with the fence, and keeping it separate both keeps the fence
> >implementation simpler and allows us to extend the dependency tracking
> >into a priority tree (whilst maintaining support for reordering the
> >tree).
> >
> >To avoid the additional allocations and list manipulations, the use of
> >the priotree is disabled when there are no schedulers to use it.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_gem_request.c | 72 ++++++++++++++++++++++++++++++++-
> > drivers/gpu/drm/i915/i915_gem_request.h | 23 +++++++++++
> > 2 files changed, 94 insertions(+), 1 deletion(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index 9c8605c834f9..13090f226203 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -113,6 +113,59 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
> > 	spin_unlock(&file_priv->mm.lock);
> > }
> >
> >+static int
> >+i915_priotree_add_dependency(struct i915_priotree *pt,
> >+			     struct i915_priotree *signal,
> >+			     struct i915_dependency *dep)
> >+{
> >+	unsigned long flags = 0;
> >+
> >+	if (!dep) {
> >+		dep = kmalloc(sizeof(*dep), GFP_KERNEL);
> 
> I will mention a dedicated cache again since this could possibly be
> our hottest allocation path. With a dedicated slab I've seen it grow
> to 5-7k objects in some benchmarks, with the request slab around 1k
> at the same time.

I'm open to one. We allocate more of these than we do even for fences. I
was thinking it could be added later, but if we change the api to always
pass in the i915_dependency it will probably work better.
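
The usual dedicated-slab pattern should be all we need; a sketch only
(the cache name and where it gets created are invented):

static struct kmem_cache *slab_dependencies;

/* at driver load */
slab_dependencies = KMEM_CACHE(i915_dependency, SLAB_HWCACHE_ALIGN);

/* in i915_priotree_add_dependency() */
dep = kmem_cache_alloc(slab_dependencies, GFP_KERNEL);

/* in i915_priotree_fini(), for I915_DEPENDENCY_ALLOC entries */
kmem_cache_free(slab_dependencies, dep);
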
> 
> >+		if (!dep)
> >+			return -ENOMEM;
> >+
> >+		flags |= I915_DEPENDENCY_ALLOC;
> >+	}
> 
> Not sure if it would be any nicer to just set flags to
> I915_DEPENDENCY_ALLOC after allocating, and add an else path setting
> it to zero here.

I just tend to avoid if {} else {} if I can help it; just a personal
preference.

> >+struct i915_dependency {
> >+	struct i915_priotree *signal;
> >+	struct list_head pre_link, post_link;
> >+	unsigned long flags;
> >+#define I915_DEPENDENCY_ALLOC BIT(0)
> >+};
> >+
> >+struct i915_priotree {
> >+	struct list_head pre_list; /* who is before us, we depend upon */
> >+	struct list_head post_list; /* who is after us, they depend upon us */
> >+};
> 
> I need a picture to imagine this data structure. :(

The names suck.

\|||/
 \|/
  .
 /|\
/|||\

Googling bidirectional acyclic graph doesn't help that much.
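
One way to picture it: each i915_dependency is a single edge "B waits
upon A" (A and B being requests), threaded onto both endpoints. This is
just the existing code from the patch, redrawn:

	dep->signal = &A->priotree;
	list_add(&dep->post_link, &A->priotree.post_list); /* A's waiters */
	list_add(&dep->pre_link, &B->priotree.pre_list); /* B's signalers */

	A.priotree.post_list ----- dep ----- B.priotree.pre_list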

> >+
> > /**
> >  * Request queue structure.
> >  *
> >@@ -89,6 +101,17 @@ struct drm_i915_gem_request {
> > 	wait_queue_t submitq;
> > 	wait_queue_t execq;
> >
> >+	/* A list of everyone we wait upon, and everyone who waits upon us.
> >+	 * Even though we will not be submitted to the hardware before the
> >+	 * submit fence is signaled (it waits for all external events as well
> >+	 * as our own requests), the scheduler still needs to know the
> >+	 * dependency tree for the lifetime of the request (from execbuf
> >+	 * to retirement), i.e. bidirectional dependency information for the
> >+	 * request not tied to individual fences.
> >+	 */
> >+	struct i915_priotree priotree;
> >+	struct i915_dependency depq;
> 
> Why depq and not dep_node or something? Makes me wonder what I am
> missing about the "q". It is not a queue?!?

Just a pattern was forming. It's just as much of a queue as our
wait_queue_t above :) Point taken.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 03/12] drm/i915: Remove engine->execlist_lock
  2016-11-03 10:47   ` Tvrtko Ursulin
@ 2016-11-03 12:28     ` Chris Wilson
  2016-11-03 13:31       ` Tvrtko Ursulin
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 12:28 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 10:47:54AM +0000, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >@@ -600,9 +598,8 @@ static void intel_lrc_irq_handler(unsigned long data)
> > static void execlists_submit_request(struct drm_i915_gem_request *request)
> > {
> > 	struct intel_engine_cs *engine = request->engine;
> >-	unsigned long flags;
> >
> >-	spin_lock_irqsave(&engine->execlist_lock, flags);
> 
> Again I am confused why this wasn't just _bh.

We may be in hardirq context here (normally, from just i915, we are
not). At the very least irqs are disabled here, making spin_unlock_bh()
verboten.
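
That is, on this path only the irqsave form is safe:

	spin_lock_irqsave(&engine->execlist_lock, flags);
	...
	spin_unlock_irqrestore(&engine->execlist_lock, flags);

whereas spin_unlock_bh() ends in local_bh_enable(), which may run
softirqs and warns if interrupts are disabled.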
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-02 17:50 ` [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2016-11-03 13:17   ` Tvrtko Ursulin
  0 siblings, 0 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 13:17 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue, however it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

How big is the performance hit? :)

Regards,

Tvrtko

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 83 ++++++++++++++++++++++++++----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 75 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index bab0c2fc3bce..601b8777d3fd 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -658,6 +658,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }
>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1524,16 +1588,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 6d7505b5c5e7..217f63e17e4e 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);
>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 70ac74c959bd..d17c0fff9642 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1350,9 +1350,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>

* Re: [PATCH 03/12] drm/i915: Remove engine->execlist_lock
  2016-11-03 12:28     ` Chris Wilson
@ 2016-11-03 13:31       ` Tvrtko Ursulin
  0 siblings, 0 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 13:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 03/11/2016 12:28, Chris Wilson wrote:
> On Thu, Nov 03, 2016 at 10:47:54AM +0000, Tvrtko Ursulin wrote:
>>
>> On 02/11/2016 17:50, Chris Wilson wrote:
>>> @@ -600,9 +598,8 @@ static void intel_lrc_irq_handler(unsigned long data)
>>> static void execlists_submit_request(struct drm_i915_gem_request *request)
>>> {
>>> 	struct intel_engine_cs *engine = request->engine;
>>> -	unsigned long flags;
>>>
>>> -	spin_lock_irqsave(&engine->execlist_lock, flags);
>>
>> Again I am confused why this wasn't just _bh.
>
> We may be in hardirq context here (normally, from just i915, we are
> not). At the very least irqs are disabled here making spin_unlock_bh()
> veboten.

Blast, I thought softirqs were a subset of hardirqs. :) On looking at the 
code I see it is not implemented at all like that. Apart from sometimes. :)

Regards,

Tvrtko


* Re: [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-02 17:50 ` [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
@ 2016-11-03 16:21   ` Tvrtko Ursulin
  2016-11-03 17:44     ` Chris Wilson
  2016-11-03 19:47     ` Chris Wilson
  0 siblings, 2 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 16:21 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


Hi,

1st pass comments only for now.

On 02/11/2016 17:50, Chris Wilson wrote:
> Track the priority of each request and use it to determine the order in
> which we submit requests to the hardware via execlists.
>
> The priority of the request is determined by the user (eventually via
> the context) but may be overridden at any time by the driver. When we set
> the priority of the request, we bump the priority of all of its
> dependencies to match - so that a high priority drawing operation is not
> stuck behind a background task.
>
> When the request is ready to execute (i.e. we have signaled the submit
> fence following completion of all its dependencies, including third
> party fences), we put the request into a priority sorted rbtree to be
> submitted to the hardware. If the request is higher priority than all
> pending requests, it will be submitted on the next context-switch
> interrupt as soon as the hardware has completed the current request. We
> do not currently preempt any current execution to immediately run a very
> high priority request, at least not yet.
>
> One more limitation is that this first implementation is for
> execlists only, so it is currently limited to gen8/gen9.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
>  drivers/gpu/drm/i915/i915_gem.c            |   3 +-
>  drivers/gpu/drm/i915/i915_gem_request.c    |   4 ++
>  drivers/gpu/drm/i915/i915_gem_request.h    |   6 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c |   4 ++
>  drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
>  drivers/gpu/drm/i915/intel_lrc.c           | 104 ++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
>  8 files changed, 109 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 3cb96d260dfb..dac435680e98 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
>  			  struct drm_i915_gem_request *rq,
>  			  const char *prefix)
>  {
> -	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
> +	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,

Noticed this is quite unreadable due to the range of INT_MIN to INT_MAX. 
Do we need such granularity or could we use something smaller so it looks 
nicer here? I know, the argument is quite poor. :)

>  		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
> +		   rq->priotree.priority,
>  		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
>  		   rq->timeline->common->name);
>  }
> @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>
>  		if (i915.enable_execlists) {
>  			u32 ptr, read, write;
> +			struct rb_node *rb;
>
>  			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
>  				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>  			rcu_read_unlock();
>
>  			spin_lock_irq(&engine->timeline->lock);
> -			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
> +			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> +				rq = rb_entry(rb, typeof(*rq), priotree.node);
>  				print_request(m, rq, "\t\tQ ");
>  			}
>  			spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index cf28666021f8..4697848ecfd9 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2687,10 +2687,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>
>  	if (i915.enable_execlists) {
>  		spin_lock_irq(&engine->timeline->lock);
> -		INIT_LIST_HEAD(&engine->execlist_queue);
>  		i915_gem_request_put(engine->execlist_port[0].request);
>  		i915_gem_request_put(engine->execlist_port[1].request);
>  		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> +		engine->execlist_queue = RB_ROOT;
> +		engine->execlist_first = NULL;
>  		spin_unlock_irq(&engine->timeline->lock);
>  	}
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 13090f226203..5f0068fac3e9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -141,6 +141,8 @@ i915_priotree_fini(struct i915_priotree *pt)
>  {
>  	struct i915_dependency *dep, *next;
>
> +	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> +
>  	/* Everyone we depended upon should retire before us and remove
>  	 * themselves from our list. However, retirement is run independently
>  	 * on each timeline and so we may be called out-of-order.
> @@ -164,6 +166,8 @@ i915_priotree_init(struct i915_priotree *pt)
>  {
>  	INIT_LIST_HEAD(&pt->pre_list);
>  	INIT_LIST_HEAD(&pt->post_list);
> +	RB_CLEAR_NODE(&pt->node);
> +	pt->priority = INT_MIN;
>  }
>
>  void i915_gem_retire_noop(struct i915_gem_active *active,
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 0cb2ba546062..4288f512ac51 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -51,6 +51,9 @@ struct i915_dependency {
>  struct i915_priotree {
>  	struct list_head pre_list; /* who is before us, we depend upon */
>  	struct list_head post_list; /* who is after us, they depend upon us */
> +	struct rb_node node;
> +	int priority;
> +#define I915_PRIORITY_MAX 1024

Unused in this patch.

>  };
>
>  /**
> @@ -168,9 +171,6 @@ struct drm_i915_gem_request {
>  	struct drm_i915_file_private *file_priv;
>  	/** file_priv list entry for this request */
>  	struct list_head client_list;
> -
> -	/** Link in the execlist submission queue, guarded by execlist_lock. */
> -	struct list_head execlist_link;
>  };
>
>  extern const struct dma_fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index ddb574d5ceda..b31573a825fa 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -640,6 +640,9 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	rq->previous_context = rq->engine->last_context;
> +	rq->engine->last_context = rq->ctx;
> +
>  	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
> @@ -1522,6 +1525,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = i915_guc_submit;
> +		engine->schedule = NULL;
>
>  		/* Replay the current set of previously submitted requests */
>  		list_for_each_entry(request,
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c9171a058478..3da4d466e332 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
>   */
>  void intel_engine_setup_common(struct intel_engine_cs *engine)
>  {
> -	INIT_LIST_HEAD(&engine->execlist_queue);
> +	engine->execlist_queue = RB_ROOT;
> +	engine->execlist_first = NULL;
>
>  	intel_engine_init_timeline(engine);
>  	intel_engine_init_hangcheck(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 186c3ccb2c34..70ac74c959bd 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
> -	struct drm_i915_gem_request *cursor, *last;
> +	struct drm_i915_gem_request *last;
>  	struct execlist_port *port = engine->execlist_port;
>  	unsigned long flags;
> +	struct rb_node *rb;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 */
>
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
> -	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
>  		/* Can we combine this request with the current port? It has to
>  		 * be the same context/ringbuffer and not have any exceptions
>  		 * (e.g. GVT saying never to combine contexts).
> @@ -503,16 +508,27 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			port++;
>  		}
>
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		/* We keep the previous context alive until we retire the
> +		 * following request. This ensures that any the context object
> +		 * is still pinned for any residual writes the HW makes into
> +		 * it on the context switch into the next object following the
> +		 * breadcrumb. Otherwise, we may retire the context too early.
> +		 */
> +		cursor->previous_context = engine->last_context;
> +		engine->last_context = cursor->ctx;
> +

Will we need a knob to control the amount of context merging? Until 
preemption that is, similar to GuC queue depth "nerfing" from the other 
patch.

>  		__i915_gem_request_submit(cursor);
>  		last = cursor;
>  		submit = true;
>  	}
>  	if (submit) {
> -		/* Decouple all the requests submitted from the queue */
> -		engine->execlist_queue.next = &cursor->execlist_link;
> -		cursor->execlist_link.prev = &engine->execlist_queue;
> -
>  		i915_gem_request_assign(&port->request, last);
> +		engine->execlist_first = rb;
>  	}
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
> @@ -595,26 +611,77 @@ static void intel_lrc_irq_handler(unsigned long data)
>  	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>  }
>
> +static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> +{
> +	struct rb_node **p, *rb;
> +	bool first = true;
> +
> +	/* most positive priority is scheduled first, equal priorities fifo */
> +	rb = NULL;
> +	p = &root->rb_node;
> +	while (*p) {
> +		struct i915_priotree *pos;
> +
> +		rb = *p;
> +		pos = rb_entry(rb, typeof(*pos), node);
> +		if (pt->priority > pos->priority) {
> +			p = &rb->rb_left;
> +		} else {
> +			p = &rb->rb_right;
> +			first = false;
> +		}
> +	}
> +	rb_link_node(&pt->node, rb, p);
> +	rb_insert_color(&pt->node, root);
> +
> +	return first;
> +}
> +
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
>
>  	assert_spin_locked(&engine->timeline->lock);
>
> -	/* We keep the previous context alive until we retire the following
> -	 * request. This ensures that any the context object is still pinned
> -	 * for any residual writes the HW makes into it on the context switch
> -	 * into the next object following the breadcrumb. Otherwise, we may
> -	 * retire the context too early.
> -	 */
> -	request->previous_context = engine->last_context;
> -	engine->last_context = request->ctx;
> -
> -	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> +	if (insert_request(&request->priotree, &engine->execlist_queue))
> +		engine->execlist_first = &request->priotree.node;
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
>  }
>
> +static void update_priorities(struct i915_priotree *pt, int prio)
> +{
> +	struct drm_i915_gem_request *request =
> +		container_of(pt, struct drm_i915_gem_request, priotree);
> +	struct intel_engine_cs *engine = request->engine;
> +	struct i915_dependency *dep;
> +
> +	if (prio <= READ_ONCE(pt->priority))
> +		return;
> +
> +	/* Recursively bump all dependent priorities to match the new request */
> +	list_for_each_entry(dep, &pt->pre_list, pre_link)
> +		update_priorities(dep->signal, prio);

John got in trouble from recursion in his scheduler, used for the same 
thing AFAIR. Or was it the priority bumping? Either way, it could be 
imperative to avoid it.

> +
> +	/* Fifo and depth-first replacement ensure our deps execute before us */
> +	spin_lock_irq(&engine->timeline->lock);
> +	pt->priority = prio;
> +	if (!RB_EMPTY_NODE(&pt->node)) {
> +		rb_erase(&pt->node, &engine->execlist_queue);
> +		if (insert_request(pt, &engine->execlist_queue))
> +			engine->execlist_first = &pt->node;
> +	}
> +	spin_unlock_irq(&engine->timeline->lock);
> +}
> +
> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> +{
> +	/* Make sure that our entire dependency chain shares our prio */
> +	update_priorities(&request->priotree, prio);
> +
> +	/* XXX Do we need to preempt to make room for us and our deps? */
> +}

Hm, why isn't the scheduling backend agnostic? Does it make it much 
simpler to do the first implementation like this?

> +
>  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> @@ -1651,8 +1718,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>
> -	for_each_engine(engine, dev_priv, id)
> +	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = execlists_submit_request;
> +		engine->schedule = execlists_schedule;
> +	}
>  }
>
>  static void
> @@ -1665,6 +1734,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>  	engine->emit_breadcrumb = gen8_emit_breadcrumb;
>  	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
>  	engine->submit_request = execlists_submit_request;
> +	engine->schedule = execlists_schedule;
>
>  	engine->irq_enable = gen8_logical_ring_enable_irq;
>  	engine->irq_disable = gen8_logical_ring_disable_irq;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 75991a3c694b..cbc148863a03 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -348,7 +348,8 @@ struct intel_engine_cs {
>  		struct drm_i915_gem_request *request;
>  		unsigned int count;
>  	} execlist_port[2];
> -	struct list_head execlist_queue;
> +	struct rb_root execlist_queue;
> +	struct rb_node *execlist_first;
>  	unsigned int fw_domains;
>  	bool disable_lite_restore_wa;
>  	bool preempt_wa;
>

Regards,

Tvrtko

* Re: [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips
  2016-11-02 17:50 ` [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips Chris Wilson
@ 2016-11-03 16:29   ` Tvrtko Ursulin
  2016-11-03 16:54     ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 16:29 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> Boost the priority of any rendering required to show the next pageflip
> as we want to avoid missing the vblank by being delayed by invisible
> workload. We prioritise avoiding jank and jitter in the GUI over
> starving background tasks.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h      |  5 ++++
>  drivers/gpu/drm/i915/i915_gem.c      | 50 ++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_display.c |  2 ++
>  3 files changed, 57 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 61fee0b0c302..738ec44fe6af 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3416,6 +3416,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
>  			 unsigned int flags,
>  			 long timeout,
>  			 struct intel_rps_client *rps);
> +int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
> +				  unsigned int flags,
> +				  int priority);
> +#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
> +
>  int __must_check
>  i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
>  				  bool write);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 4697848ecfd9..4287c51fb461 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -433,6 +433,56 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
>  	return timeout;
>  }
>
> +static void fence_set_priority(struct dma_fence *fence, int prio)
> +{
> +	struct drm_i915_gem_request *rq;
> +	struct intel_engine_cs *engine;
> +
> +	if (!dma_fence_is_i915(fence))
> +		return;
> +
> +	rq = to_request(fence);
> +	engine = rq->engine;
> +	if (!engine->schedule)
> +		return;
> +
> +	engine->schedule(rq, prio);

This will be inefficient with reservation objects containing multiple 
i915 fences.

Instead you could update just a single priority and then rebalance the 
tree at the end.

Not sure how much work that would be. Perhaps it can be improved later 
on. Or we don't expect this scenario to occur here?

> +}
> +
> +int
> +i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
> +			      unsigned int flags,
> +			      int prio)
> +{
> +	struct dma_fence *excl;
> +
> +	if (flags & I915_WAIT_ALL) {
> +		struct dma_fence **shared;
> +		unsigned int count, i;
> +		int ret;
> +
> +		ret = reservation_object_get_fences_rcu(obj->resv,
> +							&excl, &count, &shared);
> +		if (ret)
> +			return ret;
> +
> +		for (i = 0; i < count; i++) {
> +			fence_set_priority(shared[i], prio);
> +			dma_fence_put(shared[i]);
> +		}
> +
> +		kfree(shared);
> +	} else {
> +		excl = reservation_object_get_excl_rcu(obj->resv);
> +	}
> +
> +	if (excl) {
> +		fence_set_priority(excl, prio);
> +		dma_fence_put(excl);
> +	}
> +	return 0;
> +}
> +
>  /**
>   * Waits for rendering to the object to be completed
>   * @obj: i915 gem object
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 97589102442c..8df9554432a9 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -14761,6 +14761,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
>  						      GFP_KERNEL);
>  		if (ret < 0)
>  			return ret;
> +
> +		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
>  	}
>
>  	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
>

Regards,

Tvrtko

* Re: [PATCH 08/12] drm/i915/guc: Cache the client mapping
  2016-11-02 17:50 ` [PATCH 08/12] drm/i915/guc: Cache the client mapping Chris Wilson
@ 2016-11-03 16:37   ` Tvrtko Ursulin
  2016-11-03 20:01     ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-03 16:37 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 02/11/2016 17:50, Chris Wilson wrote:
> Use i915_gem_object_pin_map() for the guc client's lifetime to replace
> the persistent kmap + frequent kmap_atomic with a permanent vmapping.
> This avoids taking the obj->mm.lock mutex whilst inside irq context
> later.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 38 +++++++++++++++---------------
>  drivers/gpu/drm/i915/intel_guc.h           |  2 +-
>  2 files changed, 20 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index b31573a825fa..bab0c2fc3bce 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -220,7 +220,7 @@ static int guc_update_doorbell_id(struct intel_guc *guc,
>  	struct guc_context_desc desc;
>  	size_t len;
>
> -	doorbell = client->client_base + client->doorbell_offset;
> +	doorbell = client->vaddr + client->doorbell_offset;
>
>  	if (client->doorbell_id != GUC_INVALID_DOORBELL_ID &&
>  	    test_bit(client->doorbell_id, doorbell_bitmap)) {
> @@ -326,7 +326,7 @@ static void guc_proc_desc_init(struct intel_guc *guc,
>  {
>  	struct guc_process_desc *desc;
>
> -	desc = client->client_base + client->proc_desc_offset;
> +	desc = client->vaddr + client->proc_desc_offset;
>
>  	memset(desc, 0, sizeof(*desc));
>
> @@ -413,8 +413,8 @@ static void guc_ctx_desc_init(struct intel_guc *guc,
>  	gfx_addr = i915_ggtt_offset(client->vma);
>  	desc.db_trigger_phy = sg_dma_address(client->vma->pages->sgl) +
>  				client->doorbell_offset;
> -	desc.db_trigger_cpu = (uintptr_t)client->client_base +
> -				client->doorbell_offset;
> +	desc.db_trigger_cpu =
> +		(uintptr_t)client->vaddr + client->doorbell_offset;
>  	desc.db_trigger_uk = gfx_addr + client->doorbell_offset;
>  	desc.process_desc = gfx_addr + client->proc_desc_offset;
>  	desc.wq_addr = gfx_addr + client->wq_offset;
> @@ -465,7 +465,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  {
>  	const size_t wqi_size = sizeof(struct guc_wq_item);
>  	struct i915_guc_client *gc = request->i915->guc.execbuf_client;
> -	struct guc_process_desc *desc = gc->client_base + gc->proc_desc_offset;
> +	struct guc_process_desc *desc = gc->vaddr + gc->proc_desc_offset;
>  	u32 freespace;
>  	int ret;
>
> @@ -506,10 +506,9 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
>  	struct intel_engine_cs *engine = rq->engine;
>  	struct guc_process_desc *desc;
>  	struct guc_wq_item *wqi;
> -	void *base;
> -	u32 freespace, tail, wq_off, wq_page;
> +	u32 freespace, tail, wq_off;
>
> -	desc = gc->client_base + gc->proc_desc_offset;
> +	desc = gc->vaddr + gc->proc_desc_offset;
>
>  	/* Free space is guaranteed, see i915_guc_wq_reserve() above */
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
> @@ -539,10 +538,7 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
>  	gc->wq_rsvd -= wqi_size;
>
>  	/* WQ starts from the page after doorbell / process_desc */
> -	wq_page = (wq_off + GUC_DB_SIZE) >> PAGE_SHIFT;
> -	wq_off &= PAGE_SIZE - 1;
> -	base = kmap_atomic(i915_gem_object_get_page(gc->vma->obj, wq_page));
> -	wqi = (struct guc_wq_item *)((char *)base + wq_off);
> +	wqi = gc->vaddr + wq_off + GUC_DB_SIZE;
>
>  	/* Now fill in the 4-word work queue item */
>  	wqi->header = WQ_TYPE_INORDER |
> @@ -555,8 +551,6 @@ static void guc_wq_item_append(struct i915_guc_client *gc,
>
>  	wqi->ring_tail = tail << WQ_RING_TAIL_SHIFT;
>  	wqi->fence_id = rq->global_seqno;
> -
> -	kunmap_atomic(base);
>  }
>
>  static int guc_ring_doorbell(struct i915_guc_client *gc)
> @@ -566,7 +560,7 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
>  	union guc_doorbell_qw *db;
>  	int attempt = 2, ret = -EAGAIN;
>
> -	desc = gc->client_base + gc->proc_desc_offset;
> +	desc = gc->vaddr + gc->proc_desc_offset;
>
>  	/* Update the tail so it is visible to GuC */
>  	desc->tail = gc->wq_tail;
> @@ -582,7 +576,7 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
>  		db_exc.cookie = 1;
>
>  	/* pointer of current doorbell cacheline */
> -	db = gc->client_base + gc->doorbell_offset;
> +	db = gc->vaddr + gc->doorbell_offset;
>
>  	while (attempt--) {
>  		/* lets ring the doorbell */
> @@ -729,14 +723,14 @@ guc_client_free(struct drm_i915_private *dev_priv,
>  	 * Be sure to drop any locks
>  	 */
>
> -	if (client->client_base) {
> +	if (client->vaddr) {
>  		/*
>  		 * If we got as far as setting up a doorbell, make sure we
>  		 * shut it down before unmapping & deallocating the memory.
>  		 */
>  		guc_disable_doorbell(guc, client);
>
> -		kunmap(kmap_to_page(client->client_base));
> +		i915_gem_object_unpin_map(client->vma->obj);
>  	}
>
>  	i915_vma_unpin_and_release(&client->vma);
> @@ -825,6 +819,7 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
>  	struct i915_guc_client *client;
>  	struct intel_guc *guc = &dev_priv->guc;
>  	struct i915_vma *vma;
> +	void *vaddr;
>  	uint16_t db_id;
>
>  	client = kzalloc(sizeof(*client), GFP_KERNEL);
> @@ -851,7 +846,12 @@ guc_client_alloc(struct drm_i915_private *dev_priv,
>
>  	/* We'll keep just the first (doorbell/proc) page permanently kmap'd. */
>  	client->vma = vma;
> -	client->client_base = kmap(i915_vma_first_page(vma));
> +
> +	vaddr = i915_gem_object_pin_map(vma->obj, I915_MAP_WB);
> +	if (IS_ERR(vaddr))
> +		goto err;
> +
> +	client->vaddr = vaddr;
>
>  	spin_lock_init(&client->wq_lock);
>  	client->wq_offset = GUC_DB_SIZE;
> diff --git a/drivers/gpu/drm/i915/intel_guc.h b/drivers/gpu/drm/i915/intel_guc.h
> index 49ced0bad0f5..0053258e03d3 100644
> --- a/drivers/gpu/drm/i915/intel_guc.h
> +++ b/drivers/gpu/drm/i915/intel_guc.h
> @@ -64,7 +64,7 @@ struct drm_i915_gem_request;
>   */
>  struct i915_guc_client {
>  	struct i915_vma *vma;
> -	void *client_base;		/* first page (only) of above	*/
> +	void *vaddr;
>  	struct i915_gem_context *owner;
>  	struct intel_guc *guc;
>
>

Looks OK, I thought we'd done this months ago. :)

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips
  2016-11-03 16:29   ` Tvrtko Ursulin
@ 2016-11-03 16:54     ` Chris Wilson
  0 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 16:54 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 04:29:52PM +0000, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >Boost the priority of any rendering required to show the next pageflip
> >as we want to avoid missing the vblank by being delayed by invisible
> >workload. We prioritise avoiding jank and jitter in the GUI over
> >starving background tasks.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_drv.h      |  5 ++++
> > drivers/gpu/drm/i915/i915_gem.c      | 50 ++++++++++++++++++++++++++++++++++++
> > drivers/gpu/drm/i915/intel_display.c |  2 ++
> > 3 files changed, 57 insertions(+)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 61fee0b0c302..738ec44fe6af 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -3416,6 +3416,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
> > 			 unsigned int flags,
> > 			 long timeout,
> > 			 struct intel_rps_client *rps);
> >+int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
> >+				  unsigned int flags,
> >+				  int priority);
> >+#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
> >+
> > int __must_check
> > i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
> > 				  bool write);
> >diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >index 4697848ecfd9..4287c51fb461 100644
> >--- a/drivers/gpu/drm/i915/i915_gem.c
> >+++ b/drivers/gpu/drm/i915/i915_gem.c
> >@@ -433,6 +433,56 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
> > 	return timeout;
> > }
> >
> >+static void fence_set_priority(struct dma_fence *fence, int prio)
> >+{
> >+	struct drm_i915_gem_request *rq;
> >+	struct intel_engine_cs *engine;
> >+
> >+	if (!dma_fence_is_i915(fence))
> >+		return;
> >+
> >+	rq = to_request(fence);
> >+	engine = rq->engine;
> >+	if (!engine->schedule)
> >+		return;
> >+
> >+	engine->schedule(rq, prio);
> 
> This will be inefficient with reservation objects containing
> multiple i915 fences.

We recursively walk a list of lists; inefficiency is its middle name!

> Instead you could update just a single priority and then rebalance
> the tree at the end.
> 
> Not sure how much work that would be. Perhaps it can be improved
> later on. Or we don't expect this scenario to occur here?

I don't think it will be so bad as to be noticeable. The principle being
that much of the dependency tree of the multiple fences is likely the
same and so we end up completing the walk much earlier. What is more
worrying is that we can get non-i915 fences and not bump the i915
dependencies beneath. The most likely of these is fence-array.
For fence-array at least we can extend fence_set_priority (but nouveau-i915
interdependencies will be left behind).
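
A sketch of teaching fence_set_priority() about fence-arrays (assuming
the dma-fence-array helpers; not part of this series as posted):

static void fence_set_priority(struct dma_fence *fence, int prio)
{
	/* recurse into composite fences first */
	if (dma_fence_is_array(fence)) {
		struct dma_fence_array *array = to_dma_fence_array(fence);
		int i;

		for (i = 0; i < array->num_fences; i++)
			fence_set_priority(array->fences[i], prio);
		return;
	}

	if (!dma_fence_is_i915(fence))
		return;

	/* ... bump the request priority as before ... */
}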

Anyway I don't think calling engine->schedule() on each request is as
bad as the ->schedule() implementation itself.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-03 16:21   ` Tvrtko Ursulin
@ 2016-11-03 17:44     ` Chris Wilson
  2016-11-03 19:47     ` Chris Wilson
  1 sibling, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 17:44 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> >+		rb = rb_next(rb);
> >+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> >+		RB_CLEAR_NODE(&cursor->priotree.node);
> >+		cursor->priotree.priority = INT_MAX;
> >+
> >+		/* We keep the previous context alive until we retire the
> >+		 * following request. This ensures that any the context object
> >+		 * is still pinned for any residual writes the HW makes into
> >+		 * it on the context switch into the next object following the
> >+		 * breadcrumb. Otherwise, we may retire the context too early.
> >+		 */
> >+		cursor->previous_context = engine->last_context;
> >+		engine->last_context = cursor->ctx;
> >+
> 
> Will we need a knob to control the amount of context merging?
> Until preemption that is, similar to GuC queue depth "nerfing" from
> the other patch.

Right, timeslicing + preemption is perhaps the best answer here. A
knob here would be one of those policy decisions that we want to avoid
:) Not quite sure how we would go about autotuning it. A GL workload
looks quite different to a VK workload and quite different again to a CL
workload. One can only imagine the horrors on the transcode side :)

Making those decisions is the real heart of a scheduler.

This code should be more like:

while (req = scheduler.next_request()) {
	if (!is_compatible(port, req)) {
		if (port++)
			break;
	}

	finish_request(req);
	scheduler.remove_request(req);

	port = req;
}

Given that at least this block of code is duplicated for the guc, maybe
it is worth starting. I was trying to avoid doing that because in my
mind it is the preemption that is the complex part and should drive the
next steps.

> >+static void update_priorities(struct i915_priotree *pt, int prio)
> >+{
> >+	struct drm_i915_gem_request *request =
> >+		container_of(pt, struct drm_i915_gem_request, priotree);
> >+	struct intel_engine_cs *engine = request->engine;
> >+	struct i915_dependency *dep;
> >+
> >+	if (prio <= READ_ONCE(pt->priority))
> >+		return;
> >+
> >+	/* Recursively bump all dependent priorities to match the new request */
> >+	list_for_each_entry(dep, &pt->pre_list, pre_link)
> >+		update_priorities(dep->signal, prio);
> 
> John got in trouble from recursion in his scheduler, used for the
> same thing AFAIR. Or was it the priority bumping? Either way, it
> could be imperative to avoid it.

Yup. I haven't thrown 100,000 requests at it for this very reason.

Hmm, if I put an additional struct list_head into i915_dependency, we
can build a search list using it (praise be to the BKL).
 
> >+	/* Fifo and depth-first replacement ensure our deps execute before us */
> >+	spin_lock_irq(&engine->timeline->lock);
> >+	pt->priority = prio;
> >+	if (!RB_EMPTY_NODE(&pt->node)) {
> >+		rb_erase(&pt->node, &engine->execlist_queue);
> >+		if (insert_request(pt, &engine->execlist_queue))
> >+			engine->execlist_first = &pt->node;
> >+	}
> >+	spin_unlock_irq(&engine->timeline->lock);
> >+}
> >+
> >+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> >+{
> >+	/* Make sure that our entire dependency chain shares our prio */
> >+	update_priorities(&request->priotree, prio);
> >+
> >+	/* XXX Do we need to preempt to make room for us and our deps? */
> >+}
> 
> Hm, why isn't scheduling backend agnostic? Makes it much simpler to
> do the first implementation like this?

The scheduling is? The scheduling is just the policy. But we do need
entry points to call into the scheduler, such as change in the request
tree (here), completion of a request (context-switch interrupt) or
completion of a time slice. engine->schedule() is not the best name,
perhaps schedule_request(). Using engine is mostly a placeholder (a
convenient place to store a vfunc), but I do think we will end up with
different interactions with the scheduler based on the backend (in
particular feeding the GuC with dependency information). So, yes, this
is the easy route that should evolve as the need changes, and just
pencil in the different callbacks as entry points where the backend and
scheduler should collude.
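
For execlists that is just (sketch):

	engine->schedule = execlists_schedule;

with the guc backend later installing its own hook, e.g. one that also
passes the dependency information on to the GuC.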
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-03 16:21   ` Tvrtko Ursulin
  2016-11-03 17:44     ` Chris Wilson
@ 2016-11-03 19:47     ` Chris Wilson
  2016-11-04  9:20       ` Chris Wilson
  1 sibling, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 19:47 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> >+static void update_priorities(struct i915_priotree *pt, int prio)
> >+{
> >+	struct drm_i915_gem_request *request =
> >+		container_of(pt, struct drm_i915_gem_request, priotree);
> >+	struct intel_engine_cs *engine = request->engine;
> >+	struct i915_dependency *dep;
> >+
> >+	if (prio <= READ_ONCE(pt->priority))
> >+		return;
> >+
> >+	/* Recursively bump all dependent priorities to match the new request */
> >+	list_for_each_entry(dep, &pt->pre_list, pre_link)
> >+		update_priorities(dep->signal, prio);
> 
> John got in trouble with recursion in his scheduler, used for the
> same thing AFAIR. Or was it the priority bumping? Either way, it
> could be imperative to avoid it.

static void update_priority(struct i915_priotree *pt, int prio)
{
        struct drm_i915_gem_request *request =
                container_of(pt, struct drm_i915_gem_request, priotree);
        struct intel_engine_cs *engine = request->engine;

        spin_lock_irq(&engine->timeline->lock);
        if (prio > pt->priority) {
                pt->priority = prio;
                if (!RB_EMPTY_NODE(&pt->node)) {
                        rb_erase(&pt->node, &engine->execlist_queue);
                        if (insert_request(pt, &engine->execlist_queue))
                                engine->execlist_first = &pt->node;
                }
        }
        spin_unlock_irq(&engine->timeline->lock);
}

static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
{
        struct i915_dependency *dep, *p;
        LIST_HEAD(dfs);

        if (prio <= READ_ONCE(request->priotree.priority))
                return;

        /* Need BKL in order to use the temporary link inside i915_dependency */
        lockdep_assert_held(&request->i915->drm.struct_mutex);

        /* Recursively bump all dependent priorities to match the new request */
        list_for_each_entry(dep, &request->priotree.pre_list, pre_link)
                if (prio > READ_ONCE(dep->signal->priority))
                        list_add_tail(&dep->dfs_link, &dfs);

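        /* Breadth-first expansion: entries appended to the tail of dfs
         * are visited later in this same walk, so the whole dependency
         * graph is covered without recursion. This assumes dfs_link is
         * initialised (INIT_LIST_HEAD) when the dependency is created,
         * so that list_move_tail is safe for entries not yet on a list.
         */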
        list_for_each_entry(dep, &dfs, dfs_link) {
                list_for_each_entry(p, &dep->signal->pre_list, pre_link) {
                        GEM_BUG_ON(p == dep);
                        if (prio > READ_ONCE(p->signal->priority))
                                list_move_tail(&p->dfs_link, &dfs);
                }
        }

        /* Fifo and depth-first replacement ensure our deps execute before us */
        list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
                update_priority(dep->signal, prio);
                INIT_LIST_HEAD(&dep->dfs_link);
        }
        update_priority(&request->priotree, prio);

        /* XXX Do we need to preempt to make room for us and our deps? */
}

Still ugh. I think its only saving grace is that we don't chase beyond
the priorities we need to bump.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 08/12] drm/i915/guc: Cache the client mapping
  2016-11-03 16:37   ` Tvrtko Ursulin
@ 2016-11-03 20:01     ` Chris Wilson
  0 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-03 20:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 03, 2016 at 04:37:24PM +0000, Tvrtko Ursulin wrote:
> 
> On 02/11/2016 17:50, Chris Wilson wrote:
> >Use i915_gem_object_pin_map() for the guc client's lifetime to replace
> >the persistent kmap + frequent kmap_atomic with a permanent vmapping.
> >This avoids taking the obj->mm.lock mutex whilst inside irq context
> >later.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

> Looks OK, I thought we'd done this months ago. :)

:)
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Pulled this in to close
https://bugs.freedesktop.org/show_bug.cgi?id=98571
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-03 19:47     ` Chris Wilson
@ 2016-11-04  9:20       ` Chris Wilson
  0 siblings, 0 replies; 40+ messages in thread
From: Chris Wilson @ 2016-11-04  9:20 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

On Thu, Nov 03, 2016 at 07:47:39PM +0000, Chris Wilson wrote:
> On Thu, Nov 03, 2016 at 04:21:25PM +0000, Tvrtko Ursulin wrote:
> > >+static void update_priorities(struct i915_priotree *pt, int prio)
> > >+{
> > >+	struct drm_i915_gem_request *request =
> > >+		container_of(pt, struct drm_i915_gem_request, priotree);
> > >+	struct intel_engine_cs *engine = request->engine;
> > >+	struct i915_dependency *dep;
> > >+
> > >+	if (prio <= READ_ONCE(pt->priority))
> > >+		return;
> > >+
> > >+	/* Recursively bump all dependent priorities to match the new request */
> > >+	list_for_each_entry(dep, &pt->pre_list, pre_link)
> > >+		update_priorities(dep->signal, prio);
> > 
> > John got in trouble with recursion in his scheduler, used for the
> > same thing AFAIR. Or was it the priority bumping? Either way, it
> > could be imperative to avoid it.

Spent some time tuning (but not very well) for very deep pipelines:

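/* The priority walk may cross engines: drop the lock on the previous
 * engine's timeline before taking the next engine's.
 */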
static struct intel_engine_cs *
pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
{
	struct intel_engine_cs *engine;

	engine = container_of(pt,
			      struct drm_i915_gem_request,
			      priotree)->engine;
	if (engine != locked) {
		if (locked)
			spin_unlock_irq(&locked->timeline->lock);
		spin_lock_irq(&engine->timeline->lock);
	}

	return engine;
}

static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
{
	struct intel_engine_cs *engine = NULL;
	struct i915_dependency *dep, *p;
	struct i915_dependency stack;
	LIST_HEAD(dfs);

	if (prio <= READ_ONCE(request->priotree.priority))
		return;

	/* Need BKL in order to use the temporary link inside i915_dependency */
	lockdep_assert_held(&request->i915->drm.struct_mutex);

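	/* Seed the DFS with the request itself, via a stack-local link,
	 * so the loop below treats the request and its ancestors alike.
	 */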
	stack.signal = &request->priotree;
	list_add(&stack.dfs_link, &dfs);

	/* Recursively bump all dependent priorities to match the new request */
	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signal;

		list_for_each_entry(p, &pt->pre_list, pre_link)
			if (prio > READ_ONCE(p->signal->priority))
				list_move_tail(&p->dfs_link, &dfs);

		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
		if (!RB_EMPTY_NODE(&pt->node))
			continue;

		engine = pt_lock_engine(pt, engine);

		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
			pt->priority = prio;
			list_del_init(&dep->dfs_link);
		}
	}

	/* Fifo and depth-first replacement ensure our deps execute before us */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signal;

		INIT_LIST_HEAD(&dep->dfs_link);

		engine = pt_lock_engine(pt, engine);

		if (prio <= pt->priority)
			continue;

		GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));

		pt->priority = prio;
		rb_erase(&pt->node, &engine->execlist_queue);
		if (insert_request(pt, &engine->execlist_queue))
			engine->execlist_first = &pt->node;
	}

	if (engine)
		spin_unlock_irq(&engine->timeline->lock);

	/* XXX Do we need to preempt to make room for us and our deps? */
}

But, as always, any linear list scales poorly. It is just fortunate that
typically we don't see 10,000s of requests in the pipeline that need PI.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-03 11:55     ` Chris Wilson
@ 2016-11-04 14:44       ` Tvrtko Ursulin
  2016-11-04 15:11         ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-04 14:44 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 03/11/2016 11:55, Chris Wilson wrote:
> On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
>>
>> On 02/11/2016 17:50, Chris Wilson wrote:
>>> The scheduler needs to know the dependencies of each request for the
>>> lifetime of the request, as it may choose to reschedule the requests at
>>> any time and must ensure the dependency tree is not broken. This is in
>>> additional to using the fence to only allow execution after all
>>> dependencies have been completed.
>>>
>>> One option was to extend the fence to support the bidirectional
>>> dependency tracking required by the scheduler. However the mismatch in
>>> lifetimes between the submit fence and the request essentially meant
>>> that we had to build a completely separate struct (and we could not
>>> simply reuse the existing waitqueue in the fence for one half of the
>>> dependency tracking). The extra dependency tracking simply did not mesh
>>> well with the fence, and keeping it separate both keeps the fence
>>> implementation simpler and allows us to extend the dependency tracking
>>> into a priority tree (whilst maintaining support for reordering the
>>> tree).
>>>
>>> To avoid the additional allocations and list manipulations, the use of
>>> the priotree is disabled when there are no schedulers to use it.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>> drivers/gpu/drm/i915/i915_gem_request.c | 72 ++++++++++++++++++++++++++++++++-
>>> drivers/gpu/drm/i915/i915_gem_request.h | 23 +++++++++++
>>> 2 files changed, 94 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>>> index 9c8605c834f9..13090f226203 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>>> @@ -113,6 +113,59 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>>> 	spin_unlock(&file_priv->mm.lock);
>>> }
>>>
>>> +static int
>>> +i915_priotree_add_dependency(struct i915_priotree *pt,
>>> +			     struct i915_priotree *signal,
>>> +			     struct i915_dependency *dep)
>>> +{
>>> +	unsigned long flags = 0;
>>> +
>>> +	if (!dep) {
>>> +		dep = kmalloc(sizeof(*dep), GFP_KERNEL);
>>
>> I will mention a dedicated cache again since this could possibly be
>> our hottest allocation path. With a dedicated slab I've seen it grow
>> to 5-7k objects in some benchmarks, with the request slab around 1k
>> at the same time.
>
> I'm open to one. We allocate more of these than we do even for fences. I
> was thinking it could be added later, but if we change the api to
> always pass in the i915_dependency it will probably work better.
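> 
> Something like (a sketch, with illustrative names):
> 
> 	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
> 					    SLAB_HWCACHE_ALIGN);
> 	...
> 	dep = kmem_cache_alloc(dev_priv->dependencies, GFP_KERNEL);
> 	...
> 	kmem_cache_free(dev_priv->dependencies, dep);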
>>
>>> +		if (!dep)
>>> +			return -ENOMEM;
>>> +
>>> +		flags |= I915_DEPENDENCY_ALLOC;
>>> +	}
>>
>> Not sure if it would be any nicer to just set the flags to
>> I915_DEPENDENCY_ALLOC after allocating and add an else path to set
>> it to zero here.
>
> I just tend to avoid if {} else {} if I can help it, just a personal
> preference.
>
>>> +struct i915_dependency {
>>> +	struct i915_priotree *signal;
>>> +	struct list_head pre_link, post_link;
>>> +	unsigned long flags;
>>> +#define I915_DEPENDENCY_ALLOC BIT(0)
>>> +};
>>> +
>>> +struct i915_priotree {
>>> +	struct list_head pre_list; /* who is before us, we depend upon */
>>> +	struct list_head post_list; /* who is after us, they depend upon us */
>>> +};
>>
>> I need a picture to imagine this data structure. :(
>
> The names suck.

When you wrote this I assumed you would respin shortly with some better 
names?

I tried to grasp it one more time since then but keep getting lost. :I

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-04 14:44       ` Tvrtko Ursulin
@ 2016-11-04 15:11         ` Chris Wilson
  2016-11-07  9:12           ` Tvrtko Ursulin
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-04 15:11 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
> 
> On 03/11/2016 11:55, Chris Wilson wrote:
> >On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 02/11/2016 17:50, Chris Wilson wrote:
> >>>+struct i915_dependency {
> >>>+	struct i915_priotree *signal;
> >>>+	struct list_head pre_link, post_link;
> >>>+	unsigned long flags;
> >>>+#define I915_DEPENDENCY_ALLOC BIT(0)
> >>>+};
> >>>+
> >>>+struct i915_priotree {
> >>>+	struct list_head pre_list; /* who is before us, we depend upon */
> >>>+	struct list_head post_list; /* who is after us, they depend upon us */
> >>>+};
> >>
> >>I need a picture to imagine this data structure. :(
> >
> >The names suck.
> 
> When you wrote this I assumed you would respin shortly with some
> better names?

Not yet. I kind of like

struct i915_dependency {
	struct i915_priotree *signaler;
	struct list_head signaler_link;
	struct list_head listener_link;
};

struct i915_priotree {
	struct list_head signalers_list; /* before us, we depend on them */
	struct list_head listeners_list; /* those after, who depend on us */
};

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-04 15:11         ` Chris Wilson
@ 2016-11-07  9:12           ` Tvrtko Ursulin
  2016-11-07  9:30             ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-07  9:12 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 04/11/2016 15:11, Chris Wilson wrote:
> On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
>>
>> On 03/11/2016 11:55, Chris Wilson wrote:
>>> On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 02/11/2016 17:50, Chris Wilson wrote:
>>>>> +struct i915_dependency {
>>>>> +	struct i915_priotree *signal;
>>>>> +	struct list_head pre_link, post_link;
>>>>> +	unsigned long flags;
>>>>> +#define I915_DEPENDENCY_ALLOC BIT(0)
>>>>> +};
>>>>> +
>>>>> +struct i915_priotree {
>>>>> +	struct list_head pre_list; /* who is before us, we depend upon */
>>>>> +	struct list_head post_list; /* who is after us, they depend upon us */
>>>>> +};
>>>>
>>>> I need a picture to imagine this data structure. :(
>>>
>>> The names suck.
>>
>> When you wrote this I assumed you would respin shortly with some
>> better names?
>
> Not yet. I kind of like
>
> struct i915_dependency {
> 	struct i915_priotree *signaler;
> 	struct list_head signaler_link;
> 	struct list_head listener_link;
> };
>
> struct i915_priotree {
> 	struct list_head signalers_list; /* before us, we depend on them */
> 	struct list_head listeners_list; /* those after, who depend on us */
> };
>

What is the signaler in i915_dependency?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07  9:12           ` Tvrtko Ursulin
@ 2016-11-07  9:30             ` Chris Wilson
  2016-11-07 13:30               ` Tvrtko Ursulin
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-07  9:30 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 07, 2016 at 09:12:57AM +0000, Tvrtko Ursulin wrote:
> 
> On 04/11/2016 15:11, Chris Wilson wrote:
> >On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 03/11/2016 11:55, Chris Wilson wrote:
> >>>On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
> >>>>
> >>>>On 02/11/2016 17:50, Chris Wilson wrote:
> >>>>>+struct i915_dependency {
> >>>>>+	struct i915_priotree *signal;
> >>>>>+	struct list_head pre_link, post_link;
> >>>>>+	unsigned long flags;
> >>>>>+#define I915_DEPENDENCY_ALLOC BIT(0)
> >>>>>+};
> >>>>>+
> >>>>>+struct i915_priotree {
> >>>>>+	struct list_head pre_list; /* who is before us, we depend upon */
> >>>>>+	struct list_head post_list; /* who is after us, they depend upon us */
> >>>>>+};
> >>>>
> >>>>I need a picture to imagine this data structure. :(
> >>>
> >>>The names suck.
> >>
> >>When you wrote this I assumed you would respin shortly with some
> >>better names?
> >
> >Not yet. I kind of like
> >
> >struct i915_dependency {
> >	struct i915_priotree *signaler;
> >	struct list_head signaler_link;
> >	struct list_head listener_link;
> >};
> >
> >struct i915_priotree {
> >	struct list_head signalers_list; /* before us, we depend on them */
> >	struct list_head listeners_list; /* those after, who depend on us */
> >};
> >
> 
> What is the signaler in i915_dependency?

That would be the actual dependency. The fences have a notion of
waiters, but we need to track the signalers in order to perform PI.
Fwiw,

+struct i915_dependency {
+       struct i915_priotree *signaler;
+       struct list_head signal_link;
+       struct list_head wait_link;
+       unsigned long flags;
+#define I915_DEPENDENCY_ALLOC BIT(0)
+};
+
+/* Requests exist in a complex web of interdependencies. Each request
+ * has to wait for some other request to complete before it is ready to be run
+ * (e.g. we have to wait until the pixels have been rendered into a texture
+ * before we can copy from it). We track the readiness of a request in terms
+ * of fences, but we also need to keep the dependency tree for the lifetime
+ * of the request (beyond the life of an individual fence). We use the tree
+ * at various points to reorder the requests whilst keeping the requests
+ * in order with respect to their various dependencies.
+ */
+struct i915_priotree {
+       struct list_head signalers_list; /* those before us, we depend upon */
+       struct list_head waiters_list; /* those after us, they depend upon us */
+};


-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07  9:30             ` Chris Wilson
@ 2016-11-07 13:30               ` Tvrtko Ursulin
  2016-11-07 13:39                 ` Chris Wilson
  0 siblings, 1 reply; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-07 13:30 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 09:30, Chris Wilson wrote:
> On Mon, Nov 07, 2016 at 09:12:57AM +0000, Tvrtko Ursulin wrote:
>>
>> On 04/11/2016 15:11, Chris Wilson wrote:
>>> On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 03/11/2016 11:55, Chris Wilson wrote:
>>>>> On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
>>>>>>
>>>>>> On 02/11/2016 17:50, Chris Wilson wrote:
>>>>>>> +struct i915_dependency {
>>>>>>> +	struct i915_priotree *signal;
>>>>>>> +	struct list_head pre_link, post_link;
>>>>>>> +	unsigned long flags;
>>>>>>> +#define I915_DEPENDENCY_ALLOC BIT(0)
>>>>>>> +};
>>>>>>> +
>>>>>>> +struct i915_priotree {
>>>>>>> +	struct list_head pre_list; /* who is before us, we depend upon */
>>>>>>> +	struct list_head post_list; /* who is after us, they depend upon us */
>>>>>>> +};
>>>>>>
>>>>>> I need a picture to imagine this data structure. :(
>>>>>
>>>>> The names suck.
>>>>
>>>> When you wrote this I assumed you would respin shortly with some
>>>> better names?
>>>
>>> Not yet. I kind of like
>>>
>>> struct i915_dependency {
>>> 	struct i915_priotree *signaler;
>>> 	struct list_head signaler_link;
>>> 	struct list_head listener_link;
>>> };
>>>
>>> struct i915_priotree {
>>> 	struct list_head signalers_list; /* before us, we depend on them */
>>> 	struct list_head listeners_list; /* those after, who depend on us */
>>> };
>>>
>>
>> What is the signaler in i915_dependency?
>
> That would be the actual dependency. The fences have a notion of
> waiters, but we need to track the signalers in order to perform PI.
> Fwiw,
>
> +struct i915_dependency {
> +       struct i915_priotree *signaler;
> +       struct list_head signal_link;
> +       struct list_head wait_link;
> +       unsigned long flags;
> +#define I915_DEPENDENCY_ALLOC BIT(0)
> +};
> +
> +/* Requests exist in a complex web of interdependencies. Each request
> + * has to wait for some other request to complete before it is ready to be run
> + * (e.g. we have to wait until the pixels have been rendered into a texture
> + * before we can copy from it). We track the readiness of a request in terms
> + * of fences, but we also need to keep the dependency tree for the lifetime
> + * of the request (beyond the life of an individual fence). We use the tree
> + * at various points to reorder the requests whilst keeping the requests
> + * in order with respect to their various dependencies.
> + */
> +struct i915_priotree {
> +       struct list_head signalers_list; /* those before us, we depend upon */
> +       struct list_head waiters_list; /* those after us, they depend upon us */
> +};
>
>

req->depq is just an optimisation to avoid one allocation in the common 
case?

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07 13:30               ` Tvrtko Ursulin
@ 2016-11-07 13:39                 ` Chris Wilson
  2016-11-07 13:42                   ` Tvrtko Ursulin
  0 siblings, 1 reply; 40+ messages in thread
From: Chris Wilson @ 2016-11-07 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 07, 2016 at 01:30:43PM +0000, Tvrtko Ursulin wrote:
> 
> On 07/11/2016 09:30, Chris Wilson wrote:
> >On Mon, Nov 07, 2016 at 09:12:57AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 04/11/2016 15:11, Chris Wilson wrote:
> >>>On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
> >>>>
> >>>>On 03/11/2016 11:55, Chris Wilson wrote:
> >>>>>On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
> >>>>>>
> >>>>>>On 02/11/2016 17:50, Chris Wilson wrote:
> >>>>>>>+struct i915_dependency {
> >>>>>>>+	struct i915_priotree *signal;
> >>>>>>>+	struct list_head pre_link, post_link;
> >>>>>>>+	unsigned long flags;
> >>>>>>>+#define I915_DEPENDENCY_ALLOC BIT(0)
> >>>>>>>+};
> >>>>>>>+
> >>>>>>>+struct i915_priotree {
> >>>>>>>+	struct list_head pre_list; /* who is before us, we depend upon */
> >>>>>>>+	struct list_head post_list; /* who is after us, they depend upon us */
> >>>>>>>+};
> >>>>>>
> >>>>>>I need a picture to imagine this data structure. :(
> >>>>>
> >>>>>The names suck.
> >>>>
> >>>>When you wrote this I assumed you would respin shortly with some
> >>>>better names?
> >>>
> >>>Not yet. I kind of like
> >>>
> >>>struct i915_dependency {
> >>>	struct i915_priotree *signaler;
> >>>	struct list_head signaler_link;
> >>>	struct list_head listener_link;
> >>>};
> >>>
> >>>struct i915_priotree {
> >>>	struct list_head signalers_list; /* before us, we depend on them */
> >>>	struct list_head listeners_list; /* those after, who depend on us */
> >>>};
> >>>
> >>
> >>What is the signaler in i915_dependency?
> >
> >That would be the actual dependency. The fences have a notion of
> >waiters, but we need to track the signalers in order to perform PI.
> >Fwiw,
> >
> >+struct i915_dependency {
> >+       struct i915_priotree *signaler;
> >+       struct list_head signal_link;
> >+       struct list_head wait_link;
> >+       unsigned long flags;
> >+#define I915_DEPENDENCY_ALLOC BIT(0)
> >+};
> >+
> >+/* Requests exist in a complex web of interdependencies. Each request
> >+ * has to wait for some other request to complete before it is ready to be run
> >+ * (e.g. we have to wait until the pixels have been rendered into a texture
> >+ * before we can copy from it). We track the readiness of a request in terms
> >+ * of fences, but we also need to keep the dependency tree for the lifetime
> >+ * of the request (beyond the life of an individual fence). We use the tree
> >+ * at various points to reorder the requests whilst keeping the requests
> >+ * in order with respect to their various dependencies.
> >+ */
> >+struct i915_priotree {
> >+       struct list_head signalers_list; /* those before us, we depend upon */
> >+       struct list_head waiters_list; /* those after us, they depend upon us */
> >+};
> >
> >
> 
> req->depq is just an optimisation to avoid one allocation in the
> common case?

Because we can't handle an allocation failure at the point of use, we
need to preallocate it at the time of request construction - so it is
easiest to stuff it into the request (and reasonable since we almost
always need to use it - the only time we don't is on the very first
request on a new timeline).
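
i.e. roughly (a sketch; req->depq is the preallocated slot from the
patch, and the request named signal here is illustrative):

	/* at request construction, where -ENOMEM can still be handled */
	req->depq = kmalloc(sizeof(*req->depq), GFP_KERNEL);

	/* at the point of use, which must not fail */
	i915_priotree_add_dependency(&req->priotree, &signal->priotree,
				     req->depq);
	req->depq = NULL;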
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07 13:39                 ` Chris Wilson
@ 2016-11-07 13:42                   ` Tvrtko Ursulin
  0 siblings, 0 replies; 40+ messages in thread
From: Tvrtko Ursulin @ 2016-11-07 13:42 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 13:39, Chris Wilson wrote:
> On Mon, Nov 07, 2016 at 01:30:43PM +0000, Tvrtko Ursulin wrote:
>>
>> On 07/11/2016 09:30, Chris Wilson wrote:
>>> On Mon, Nov 07, 2016 at 09:12:57AM +0000, Tvrtko Ursulin wrote:
>>>>
>>>> On 04/11/2016 15:11, Chris Wilson wrote:
>>>>> On Fri, Nov 04, 2016 at 02:44:44PM +0000, Tvrtko Ursulin wrote:
>>>>>>
>>>>>> On 03/11/2016 11:55, Chris Wilson wrote:
>>>>>>> On Thu, Nov 03, 2016 at 11:03:47AM +0000, Tvrtko Ursulin wrote:
>>>>>>>>
>>>>>>>> On 02/11/2016 17:50, Chris Wilson wrote:
>>>>>>>>> +struct i915_dependency {
>>>>>>>>> +	struct i915_priotree *signal;
>>>>>>>>> +	struct list_head pre_link, post_link;
>>>>>>>>> +	unsigned long flags;
>>>>>>>>> +#define I915_DEPENDENCY_ALLOC BIT(0)
>>>>>>>>> +};
>>>>>>>>> +
>>>>>>>>> +struct i915_priotree {
>>>>>>>>> +	struct list_head pre_list; /* who is before us, we depend upon */
>>>>>>>>> +	struct list_head post_list; /* who is after us, they depend upon us */
>>>>>>>>> +};
>>>>>>>>
>>>>>>>> I need a picture to imagine this data structure. :(
>>>>>>>
>>>>>>> The names suck.
>>>>>>
>>>>>> When you wrote this I assumed you would respin shortly with some
>>>>>> better names?
>>>>>
>>>>> Not yet. I kind of like
>>>>>
>>>>> struct i915_dependency {
>>>>> 	struct i915_priotree *signaler;
>>>>> 	struct list_head signaler_link;
>>>>> 	struct list_head listener_link;
>>>>> };
>>>>>
>>>>> struct i915_priotree {
>>>>> 	struct list_head signalers_list; /* before us, we depend on them */
>>>>> 	struct list_head listeners_list; /* those after, who depend on us */
>>>>> };
>>>>>
>>>>
>>>> What is the signaler in i915_dependency?
>>>
>>> That would be the actual dependency. The fences have a notion of
>>> waiters, but we need to track the signalers in order to perform PI.
>>> Fwiw,
>>>
>>> +struct i915_dependency {
>>> +       struct i915_priotree *signaler;
>>> +       struct list_head signal_link;
>>> +       struct list_head wait_link;
>>> +       unsigned long flags;
>>> +#define I915_DEPENDENCY_ALLOC BIT(0)
>>> +};
>>> +
>>> +/* Requests exist in a complex web of interdependencies. Each request
>>> + * has to wait for some other request to complete before it is ready to be run
>>> + * (e.g. we have to wait until the pixels have been rendered into a texture
>>> + * before we can copy from it). We track the readiness of a request in terms
>>> + * of fences, but we also need to keep the dependency tree for the lifetime
>>> + * of the request (beyond the life of an individual fence). We use the tree
>>> + * at various points to reorder the requests whilst keeping the requests
>>> + * in order with respect to their various dependencies.
>>> + */
>>> +struct i915_priotree {
>>> +       struct list_head signalers_list; /* those before us, we depend upon */
>>> +       struct list_head waiters_list; /* those after us, they depend upon us */
>>> +};
>>>
>>>
>>
>> req->depq is just an optimisation to avoid one allocation in the
>> common case?
>
> Because we can't handle an allocation failure at the point of use, we
> need to preallocate it at the time of request construction - so it is
> easiest to stuff it into the request (and reasonable since we almost
> always need to use it - the only time we don't is on the very first
> request on a new timeline).

Yes, all clear now. Think I've figured it out. :)

Awaiting a new series now.

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2016-11-07 13:42 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-11-02 17:50 Trivial priority scheduler Chris Wilson
2016-11-02 17:50 ` [PATCH 01/12] drm/i915: Split request submit/execute phase into two Chris Wilson
2016-11-03 10:35   ` Tvrtko Ursulin
2016-11-02 17:50 ` [PATCH 02/12] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
2016-11-03 10:34   ` Tvrtko Ursulin
2016-11-03 10:51     ` Chris Wilson
2016-11-03 11:27       ` Chris Wilson
2016-11-02 17:50 ` [PATCH 03/12] drm/i915: Remove engine->execlist_lock Chris Wilson
2016-11-03 10:47   ` Tvrtko Ursulin
2016-11-03 12:28     ` Chris Wilson
2016-11-03 13:31       ` Tvrtko Ursulin
2016-11-02 17:50 ` [PATCH 04/12] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
2016-11-03 10:49   ` Tvrtko Ursulin
2016-11-02 17:50 ` [PATCH 05/12] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
2016-11-03 11:03   ` Tvrtko Ursulin
2016-11-03 11:55     ` Chris Wilson
2016-11-04 14:44       ` Tvrtko Ursulin
2016-11-04 15:11         ` Chris Wilson
2016-11-07  9:12           ` Tvrtko Ursulin
2016-11-07  9:30             ` Chris Wilson
2016-11-07 13:30               ` Tvrtko Ursulin
2016-11-07 13:39                 ` Chris Wilson
2016-11-07 13:42                   ` Tvrtko Ursulin
2016-11-02 17:50 ` [PATCH 06/12] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
2016-11-03 16:21   ` Tvrtko Ursulin
2016-11-03 17:44     ` Chris Wilson
2016-11-03 19:47     ` Chris Wilson
2016-11-04  9:20       ` Chris Wilson
2016-11-02 17:50 ` [PATCH 07/12] drm/i915/scheduler: Boost priorities for flips Chris Wilson
2016-11-03 16:29   ` Tvrtko Ursulin
2016-11-03 16:54     ` Chris Wilson
2016-11-02 17:50 ` [PATCH 08/12] drm/i915/guc: Cache the client mapping Chris Wilson
2016-11-03 16:37   ` Tvrtko Ursulin
2016-11-03 20:01     ` Chris Wilson
2016-11-02 17:50 ` [PATCH 09/12] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
2016-11-03 13:17   ` Tvrtko Ursulin
2016-11-02 17:50 ` [PATCH 10/12] drm/i915/scheduler: Support user-defined priorities Chris Wilson
2016-11-02 17:50 ` [PATCH 11/12] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
2016-11-02 17:50 ` [PATCH 12/12] drm/i915: Support explicit fencing for execbuf Chris Wilson
2016-11-02 18:45 ` ✓ Fi.CI.BAT: success for series starting with [01/12] drm/i915: Split request submit/execute phase into two Patchwork
