* Trivial scheduler, take 2
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

Not much to report here. The biggest change is converting the recursive
DFS for PI into an iterative algorithm. The biggest challenge remains
trying to settle on a consistent set of names so we avoid confusing
ourselves going forward into preemption. The biggest TODO is to check for
perf regressions from the deferred submission mechanism and for the
impact of using a 2-slot dispatch for the GuC workqueue (and the extra
tasklet wakeups for execlists, which should not be so bad as the tasklet
is due to be woken within a few tens of nanoseconds after the
user interrupt is sent!)
-Chris

* [PATCH v2 01/11] drm/i915: Create distinct lockclasses for execution vs user timelines
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

In order to simplify the lockdep annotations, as they become more complex
in the future with deferred execution and multiple paths through the
same functions, create a separate lockclass for the user timeline and
the hardware execution timeline.

We should only ever be locking the user timeline and the execution
timeline in parallel, so we only need to create two lock classes rather
than a separate class for every timeline.
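
As an aside on the mechanics (a generic lockdep sketch, not code from
this patch; the lock and class names below are made up), giving the two
flavours of timeline lock distinct lock_class_key instances turns
nesting one inside the other into an ordering between two classes,
rather than a recursion report on a single class, so no
spin_lock_nested() annotation is needed at the nesting site:

	#include <linux/spinlock.h>
	#include <linux/lockdep.h>

	static struct lock_class_key user_tl_class;	/* hypothetical */
	static struct lock_class_key exec_tl_class;	/* hypothetical */

	static void init_timeline_locks(spinlock_t *user, spinlock_t *exec)
	{
		spin_lock_init(user);
		lockdep_set_class_and_name(user, &user_tl_class, "timeline");

		spin_lock_init(exec);
		lockdep_set_class_and_name(exec, &exec_tl_class,
					   "global-timeline");
	}

	static void move_between_timelines(spinlock_t *user, spinlock_t *exec)
	{
		spin_lock(exec);
		spin_lock(user);	/* distinct class: no lockdep splat */
		spin_unlock(user);
		spin_unlock(exec);
	}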

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c          |  4 +---
 drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
 drivers/gpu/drm/i915/i915_gem_timeline.c | 31 +++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_timeline.h |  1 +
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 490fd302bd1a..f68c80a7ebd6 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4752,9 +4752,7 @@ i915_gem_load_init(struct drm_device *dev)
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
-	err = i915_gem_timeline_init(dev_priv,
-				     &dev_priv->gt.global_timeline,
-				     "[execution]");
+	err = i915_gem_timeline_init__global(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	if (err)
 		goto err_requests;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 79b0046d9a57..8984eb799da0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -347,7 +347,7 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 				request->ring->vaddr + request->postfix);
 	engine->submit_request(request);
 
-	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
index fc8f13a79f8f..3e503e35ffd1 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
@@ -24,9 +24,11 @@
 
 #include "i915_drv.h"
 
-int i915_gem_timeline_init(struct drm_i915_private *i915,
-			   struct i915_gem_timeline *timeline,
-			   const char *name)
+static int __i915_gem_timeline_init(struct drm_i915_private *i915,
+				    struct i915_gem_timeline *timeline,
+				    const char *name,
+				    struct lock_class_key *lockclass,
+				    const char *lockname)
 {
 	unsigned int i;
 	u64 fences;
@@ -47,8 +49,11 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
 
 		tl->fence_context = fences++;
 		tl->common = timeline;
-
+#ifdef CONFIG_DEBUG_SPINLOCK
+		__raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
+#else
 		spin_lock_init(&tl->lock);
+#endif
 		init_request_active(&tl->last_request, NULL);
 		INIT_LIST_HEAD(&tl->requests);
 	}
@@ -56,6 +61,24 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
 	return 0;
 }
 
+int i915_gem_timeline_init(struct drm_i915_private *i915,
+			   struct i915_gem_timeline *timeline,
+			   const char *name)
+{
+	static struct lock_class_key class;
+	return __i915_gem_timeline_init(i915, timeline, name,
+					&class, "timeline");
+}
+
+int i915_gem_timeline_init__global(struct drm_i915_private *i915)
+{
+	static struct lock_class_key class;
+	return __i915_gem_timeline_init(i915,
+					&i915->gt.global_timeline,
+					"[execution]",
+					&class, "global-timeline");
+}
+
 void i915_gem_timeline_fini(struct i915_gem_timeline *tl)
 {
 	lockdep_assert_held(&tl->i915->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
index f2bf7b1d49a1..98d99a62b4ae 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.h
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
@@ -67,6 +67,7 @@ struct i915_gem_timeline {
 int i915_gem_timeline_init(struct drm_i915_private *i915,
 			   struct i915_gem_timeline *tl,
 			   const char *name);
+int i915_gem_timeline_init__global(struct drm_i915_private *i915);
 void i915_gem_timeline_fini(struct i915_gem_timeline *tl);
 
 #endif
-- 
2.10.2

* [PATCH v2 02/11] drm/i915: Split request submit/execute phase into two
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

In order to support deferred scheduling, we need to differentiate
between when the request is ready to run (i.e. the submit fence is
signaled) and when the request is actually run (for which we add a new
execute fence). This is typically split between the request itself
wanting to wait upon others (for which we use the submit fence) and the
CPU wanting to wait upon the request, for which we use the execute fence
to be sure the hardware is ready to signal completion.
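
Condensed, the lifecycle of the two fences looks as follows (a sketch
stitched together from the call sites touched below; in the driver these
lines are spread across request construction, __i915_add_request() and
submit_notify()):

	/* at construction: neither phase has occurred yet */
	i915_sw_fence_init(&req->submit, submit_notify);
	i915_sw_fence_init(&req->execute, execute_notify);
	/* the execute fence may not complete before the submit fence */
	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);

	/* once all dependencies are signaled, the request is ready: */
	i915_sw_fence_commit(&req->submit);	/* -> submit_notify() */

	/* submit_notify() queues to hw, then marks it as executing: */
	i915_sw_fence_commit(&req->execute);

	/* a CPU waiter sleeps on the later of the two phases: */
	if (!i915_sw_fence_done(&req->execute))
		timeout = __i915_request_wait_for_execute(req, flags, timeout);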

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 33 ++++++++++++++++++++++++---------
 drivers/gpu/drm/i915/i915_gem_request.h | 15 +++++++++++++++
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 8984eb799da0..e41d51a68ed8 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -351,11 +351,19 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
+	i915_sw_fence_commit(&request->execute);
+
 	spin_unlock_irqrestore(&timeline->lock, flags);
 
 	return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	return NOTIFY_DONE;
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -441,6 +449,12 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       __timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
+	i915_sw_fence_init(&req->execute, execute_notify);
+	/* Ensure that the execute fence completes after the submit fence -
+	 * as we complete the execute fence from within the submit fence
+	 * callback, its completion would otherwise be visible first.
+	 */
+	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
@@ -817,9 +831,9 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 }
 
 static long
-__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
-			       unsigned int flags,
-			       long timeout)
+__i915_request_wait_for_execute(struct drm_i915_gem_request *request,
+				unsigned int flags,
+				long timeout)
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
@@ -831,9 +845,9 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 		add_wait_queue(q, &reset);
 
 	do {
-		prepare_to_wait(&request->submit.wait, &wait, state);
+		prepare_to_wait(&request->execute.wait, &wait, state);
 
-		if (i915_sw_fence_done(&request->submit))
+		if (i915_sw_fence_done(&request->execute))
 			break;
 
 		if (flags & I915_WAIT_LOCKED &&
@@ -851,7 +865,7 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 
 		timeout = io_schedule_timeout(timeout);
 	} while (timeout);
-	finish_wait(&request->submit.wait, &wait);
+	finish_wait(&request->execute.wait, &wait);
 
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(q, &reset);
@@ -903,13 +917,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 	trace_i915_gem_request_wait_begin(req);
 
-	if (!i915_sw_fence_done(&req->submit)) {
-		timeout = __i915_request_wait_for_submit(req, flags, timeout);
+	if (!i915_sw_fence_done(&req->execute)) {
+		timeout = __i915_request_wait_for_execute(req, flags, timeout);
 		if (timeout < 0)
 			goto complete;
 
-		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
+		GEM_BUG_ON(!i915_sw_fence_done(&req->execute));
 	}
+	GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
 	GEM_BUG_ON(!req->global_seqno);
 
 	/* Optimistic short spin before touching IRQs */
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 75f8360b3421..c8547d9b9004 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -84,8 +84,23 @@ struct drm_i915_gem_request {
 	struct intel_timeline *timeline;
 	struct intel_signal_node signaling;
 
+	/* Fences for the various phases in the request's lifetime.
+	 *
+	 * The submit fence is used to await upon all of the request's
+	 * dependencies. When it is signaled, the request is ready to run.
+	 * It is used by the driver to then queue the request for execution.
+	 *
+	 * The execute fence is used to signal when the request has been
+	 * sent to hardware.
+	 *
+	 * It is illegal for the submit fence of one request to wait upon the
+	 * execute fence of an earlier request. It should be sufficient to
+	 * wait upon the submit fence of the earlier request.
+	 */
 	struct i915_sw_fence submit;
+	struct i915_sw_fence execute;
 	wait_queue_t submitq;
+	wait_queue_t execq;
 
 	u32 global_seqno;
 
-- 
2.10.2

* [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order even if we decide to queue
the requests onto the hardware in a different order than FIFO.
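
The resulting pattern is the same for every backend (a condensed
sketch; "poke the hardware" stands in for the ELSP write, the GuC
doorbell or I915_WRITE_TAIL respectively):

	static void backend_submit(struct drm_i915_gem_request *rq)
	{
		/* called with rq->engine->timeline->lock held, irqs off */
		__i915_gem_request_submit(rq);	/* assign the global seqno,
						 * emit the breadcrumb and
						 * move the request onto the
						 * execution timeline */

		/* ... poke the hardware ... */
	}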

v2: Rebased onto distinct global/user timeline lock classes.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
 drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
 drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
 5 files changed, 49 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index e41d51a68ed8..19c29fafb07a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 	return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request =
-		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
-	unsigned long flags;
 	u32 seqno;
 
-	if (state != FENCE_COMPLETE)
-		return NOTIFY_DONE;
-
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
-
-	/* Will be called from irq-context when using foreign DMA fences */
-	spin_lock_irqsave(&timeline->lock, flags);
+	assert_spin_locked(&timeline->lock);
 
 	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
@@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
-	engine->submit_request(request);
 
 	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
 	i915_sw_fence_commit(&request->execute);
+}
 
-	spin_unlock_irqrestore(&timeline->lock, flags);
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	if (state == FENCE_COMPLETE) {
+		struct drm_i915_gem_request *request =
+			container_of(fence, typeof(*request), submit);
+		struct intel_engine_cs *engine = request->engine;
+		unsigned long flags;
+
+		/* Will be called from irq-context when using foreign fences. */
+		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
+					 SINGLE_DEPTH_NESTING);
+		engine->submit_request(request);
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
+	}
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index c8547d9b9004..d8904863d3d9 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -241,6 +241,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 666dab7a675a..83438c6a8864 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
 	struct drm_i915_private *dev_priv = rq->i915;
-	unsigned int engine_id = rq->engine->id;
+	struct intel_engine_cs *engine = rq->engine;
+	unsigned int engine_id = engine->id;
 	struct intel_guc *guc = &rq->i915->guc;
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;
 
+	/* We keep the previous context alive until we retire the following
+	 * request. This ensures that the context object is still pinned
+	 * for any residual writes the HW makes into it on the context switch
+	 * into the next object following the breadcrumb. Otherwise, we may
+	 * retire the context too early.
+	 */
+	rq->previous_context = engine->last_context;
+	engine->last_context = rq->ctx;
+
+	__i915_gem_request_submit(rq);
+
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fa3012c342cc..fa4920e72e93 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *cursor, *last;
 	struct execlist_port *port = engine->execlist_port;
+	unsigned long flags;
 	bool submit = false;
 
 	last = port->request;
@@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+	spin_lock_irqsave(&engine->timeline->lock, flags);
 	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
@@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			i915_gem_request_assign(&port->request, last);
 			port++;
 		}
+
+		/* We keep the previous context alive until we retire the
+		 * following request. This ensures that the context object
+		 * is still pinned for any residual writes the HW makes into it
+		 * on the context switch into the next object following the
+		 * breadcrumb. Otherwise, we may retire the context too early.
+		 */
+		cursor->previous_context = engine->last_context;
+		engine->last_context = cursor->ctx;
+
+		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
@@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		i915_gem_request_assign(&port->request, last);
 	}
 	spin_unlock(&engine->execlist_lock);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
@@ -599,15 +613,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 
 	spin_lock_irqsave(&engine->execlist_lock, flags);
 
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 700e93d80616..f91ee24e2763 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_private *dev_priv = request->i915;
 
+	__i915_gem_request_submit(request);
+
 	I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-- 
2.10.2

* [PATCH v2 04/11] drm/i915: Remove engine->execlist_lock
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

The execlist_lock is now completely subsumed by the engine->timeline->lock,
and so we can remove the redundant layer of locking.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 4 ++--
 drivers/gpu/drm/i915/i915_gem.c         | 8 ++++++--
 drivers/gpu/drm/i915/intel_engine_cs.c  | 1 -
 drivers/gpu/drm/i915/intel_lrc.c        | 7 +------
 drivers/gpu/drm/i915/intel_ringbuffer.h | 1 -
 5 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index c9465fbff2df..3cb96d260dfb 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3256,11 +3256,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 				seq_printf(m, "\t\tELSP[1] idle\n");
 			rcu_read_unlock();
 
-			spin_lock_irq(&engine->execlist_lock);
+			spin_lock_irq(&engine->timeline->lock);
 			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
 				print_request(m, rq, "\t\tQ ");
 			}
-			spin_unlock_irq(&engine->execlist_lock);
+			spin_unlock_irq(&engine->timeline->lock);
 		} else if (INTEL_GEN(dev_priv) > 6) {
 			seq_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
 				   I915_READ(RING_PP_DIR_BASE(engine)));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index f68c80a7ebd6..df803e82eb07 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2689,12 +2689,16 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 	 */
 
 	if (i915.enable_execlists) {
-		spin_lock(&engine->execlist_lock);
+		unsigned long flags;
+
+		spin_lock_irqsave(&engine->timeline->lock, flags);
+
 		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
-		spin_unlock(&engine->execlist_lock);
+
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 841f8d1e1410..298f0f95dd3f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -237,7 +237,6 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
 	INIT_LIST_HEAD(&engine->execlist_queue);
-	spin_lock_init(&engine->execlist_lock);
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index fa4920e72e93..94933f4297bb 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -471,7 +471,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
@@ -524,7 +523,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 		i915_gem_request_assign(&port->request, last);
 	}
-	spin_unlock(&engine->execlist_lock);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
@@ -609,15 +607,12 @@ static void intel_lrc_irq_handler(unsigned long data)
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
-	unsigned long flags;
 
-	spin_lock_irqsave(&engine->execlist_lock, flags);
+	assert_spin_locked(&engine->timeline->lock);
 
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
-
-	spin_unlock_irqrestore(&engine->execlist_lock, flags);
 }
 
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 642b54692d0d..062bc8e1872a 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -335,7 +335,6 @@ struct intel_engine_cs {
 
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
-	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
 	struct execlist_port {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
-- 
2.10.2

* [PATCH v2 05/11] drm/i915/scheduler: Signal the arrival of a new request
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

This is the start of the scheduler: add a hook into request submission
so that the scheduler can see the arrival of new requests and prepare
its runqueues.
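
For completeness, userspace can probe for the hook with the new
getparam; a minimal sketch against libdrm (error handling elided):

	#include <xf86drm.h>
	#include <i915_drm.h>

	static int has_scheduler(int fd)
	{
		int value = 0;
		struct drm_i915_getparam gp = {
			.param = I915_PARAM_HAS_SCHEDULER,
			.value = &value,
		};

		if (drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
			return 0;

		return value;
	}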

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  4 ++++
 drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++++++
 drivers/gpu/drm/i915/intel_engine_cs.c  |  3 +++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  9 +++++++++
 include/uapi/drm/i915_drm.h             |  5 +++++
 5 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 0213a3090ab3..dc4db5ebf869 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -323,6 +323,10 @@ static int i915_getparam(struct drm_device *dev, void *data,
 		 */
 		value = i915_gem_mmap_gtt_version();
 		break;
+	case I915_PARAM_HAS_SCHEDULER:
+		value = dev_priv->engine[RCS] &&
+			dev_priv->engine[RCS]->schedule;
+		break;
 	case I915_PARAM_MMAP_VERSION:
 		/* Remember to bump this if the version changes! */
 	case I915_PARAM_HAS_GEM:
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 19c29fafb07a..93f77df9bc51 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -756,6 +756,19 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	i915_gem_mark_busy(engine);
 
+	/* Let the backend know a new request has arrived that may need
+	 * to adjust the existing execution schedule due to a high priority
+	 * request - i.e. we may want to preempt the current request in order
+	 * to run a high priority dependency chain *before* we can execute this
+	 * request.
+	 *
+	 * This is called before the request is ready to run so that we can
+	 * decide whether to preempt the entire chain so that it is ready to
+	 * run at the earliest possible convenience.
+	 */
+	if (engine->schedule)
+		engine->schedule(request, 0);
+
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 298f0f95dd3f..c9171a058478 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -102,6 +102,9 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	engine->mmio_base = info->mmio_base;
 	engine->irq_shift = info->irq_shift;
 
+	/* Nothing to do here, execute in order of dependencies */
+	engine->schedule = NULL;
+
 	dev_priv->engine[id] = engine;
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 062bc8e1872a..75991a3c694b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -267,6 +267,15 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct drm_i915_gem_request *req);
 
+	/* Call when the priority on a request has changed and it and its
+	 * dependencies may need rescheduling. Note the request itself may
+	 * not be ready to run!
+	 *
+	 * Called under the struct_mutex.
+	 */
+	void		(*schedule)(struct drm_i915_gem_request *request,
+				    int priority);
+
 	/* Some chipsets are not quite as coherent as advertised and need
 	 * an expensive kick to force a true read of the up-to-date seqno.
 	 * However, the up-to-date seqno is not always required and the last
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 03725fe89859..1c12a350eca3 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -389,6 +389,11 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_MIN_EU_IN_POOL	 39
 #define I915_PARAM_MMAP_GTT_VERSION	 40
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
+ * priorities and the driver will attempt to execute batches in priority order.
+ */
+#define I915_PARAM_HAS_SCHEDULER	 41
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
-- 
2.10.2

* [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

The scheduler needs to know the dependencies of each request for the
lifetime of the request, as it may choose to reschedule the requests at
any time and must ensure the dependency tree is not broken. This is in
addition to using the fence to only allow execution after all
dependencies have been completed.

One option was to extend the fence to support the bidirectional
dependency tracking required by the scheduler. However, the mismatch in
lifetimes between the submit fence and the request essentially meant
that we had to build a completely separate struct (and we could not
simply reuse the existing waitqueue in the fence for one half of the
dependency tracking). The extra dependency tracking simply did not mesh
well with the fence, and keeping it separate both keeps the fence
implementation simpler and allows us to extend the dependency tracking
into a priority tree (whilst maintaining support for reordering the
tree).

To avoid the additional allocations and list manipulations, the use of
the priotree is disabled when there are no schedulers to use it.
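
The resulting bookkeeping is symmetric: each request's priotree records
both everyone it waits upon and everyone waiting upon it. A sketch of
walking one direction (the pr_info() and the helper itself are
illustration only, not part of the patch):

	static void print_signalers(struct drm_i915_gem_request *rq)
	{
		struct i915_dependency *dep;

		/* everyone we must wait upon before we are ready to run */
		list_for_each_entry(dep, &rq->priotree.signalers_list,
				    signal_link) {
			struct drm_i915_gem_request *signaler =
				container_of(dep->signaler,
					     typeof(*signaler), priotree);

			pr_info("%x waits upon %x\n",
				rq->fence.seqno, signaler->fence.seqno);
		}
	}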

v2: Create a dedicated slab for i915_dependency.
    Rename the lists.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
 drivers/gpu/drm/i915/i915_drv.h            |   1 +
 drivers/gpu/drm/i915/i915_gem.c            |  14 ++-
 drivers/gpu/drm/i915/i915_gem_request.c    |  96 +++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_request.h    |  40 ++++++++-
 drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
 drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
 9 files changed, 282 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3cb96d260dfb..dac435680e98 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
 			  struct drm_i915_gem_request *rq,
 			  const char *prefix)
 {
-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+		   rq->priotree.priority,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
 		   rq->timeline->common->name);
 }
@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 
 		if (i915.enable_execlists) {
 			u32 ptr, read, write;
+			struct rb_node *rb;
 
 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 			rcu_read_unlock();
 
 			spin_lock_irq(&engine->timeline->lock);
-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+				rq = rb_entry(rb, typeof(*rq), priotree.node);
 				print_request(m, rq, "\t\tQ ");
 			}
 			spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 4735b4177100..e790147209f3 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1791,6 +1791,7 @@ struct drm_i915_private {
 	struct kmem_cache *objects;
 	struct kmem_cache *vmas;
 	struct kmem_cache *requests;
+	struct kmem_cache *dependencies;
 
 	const struct intel_device_info info;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index df803e82eb07..a4dc2da2323a 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2693,10 +2693,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 
 		spin_lock_irqsave(&engine->timeline->lock, flags);
 
-		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		engine->execlist_queue = RB_ROOT;
+		engine->execlist_first = NULL;
 
 		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
@@ -4754,12 +4755,18 @@ i915_gem_load_init(struct drm_device *dev)
 	if (!dev_priv->requests)
 		goto err_vmas;
 
+	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
+					    SLAB_HWCACHE_ALIGN |
+					    SLAB_RECLAIM_ACCOUNT);
+	if (!dev_priv->dependencies)
+		goto err_requests;
+
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
 	err = i915_gem_timeline_init__global(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	if (err)
-		goto err_requests;
+		goto err_dependencies;
 
 	INIT_LIST_HEAD(&dev_priv->context_list);
 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
@@ -4787,6 +4794,8 @@ i915_gem_load_init(struct drm_device *dev)
 
 	return 0;
 
+err_dependencies:
+	kmem_cache_destroy(dev_priv->dependencies);
 err_requests:
 	kmem_cache_destroy(dev_priv->requests);
 err_vmas:
@@ -4803,6 +4812,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
 
 	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
 
+	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
 	kmem_cache_destroy(dev_priv->vmas);
 	kmem_cache_destroy(dev_priv->objects);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 93f77df9bc51..278b103a4e95 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -113,6 +113,82 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static struct i915_dependency *
+i915_dependency_alloc(struct drm_i915_private *i915)
+{
+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+}
+
+static void
+i915_dependency_free(struct drm_i915_private *i915,
+		     struct i915_dependency *dep)
+{
+	kmem_cache_free(i915->dependencies, dep);
+}
+
+static void
+__i915_priotree_add_dependency(struct i915_priotree *pt,
+			       struct i915_priotree *signal,
+			       struct i915_dependency *dep,
+			       unsigned long flags)
+{
+	INIT_LIST_HEAD(&dep->dfs_link);
+	list_add(&dep->wait_link, &signal->waiters_list);
+	list_add(&dep->signal_link, &pt->signalers_list);
+	dep->signaler = signal;
+	dep->flags = flags;
+}
+
+static int
+i915_priotree_add_dependency(struct drm_i915_private *i915,
+			     struct i915_priotree *pt,
+			     struct i915_priotree *signal)
+{
+	struct i915_dependency *dep;
+
+	dep = i915_dependency_alloc(i915);
+	if (!dep)
+		return -ENOMEM;
+
+	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
+	return 0;
+}
+
+static void
+i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
+{
+	struct i915_dependency *dep, *next;
+
+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
+	/* Everyone we depended upon (the fences we wait to be signaled)
+	 * should retire before us and remove themselves from our list.
+	 * However, retirement is run independently on each timeline and
+	 * so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
+		list_del(&dep->wait_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
+		list_del(&dep->signal_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+}
+
+static void
+i915_priotree_init(struct i915_priotree *pt)
+{
+	INIT_LIST_HEAD(&pt->signalers_list);
+	INIT_LIST_HEAD(&pt->waiters_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;
+}
+
 void i915_gem_retire_noop(struct i915_gem_active *active,
 			  struct drm_i915_gem_request *request)
 {
@@ -182,6 +258,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);
+
+	i915_priotree_fini(request->i915, &request->priotree);
 	i915_gem_request_put(request);
 }
 
@@ -461,6 +539,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
+	i915_priotree_init(&req->priotree);
+
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
@@ -514,6 +594,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 
 	GEM_BUG_ON(to == from);
 
+	if (to->engine->schedule) {
+		ret = i915_priotree_add_dependency(to->i915,
+						   &to->priotree,
+						   &from->priotree);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (to->timeline == from->timeline)
 		return 0;
 
@@ -737,9 +825,15 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
-	if (prev)
+	if (prev) {
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
+		if (engine->schedule)
+			__i915_priotree_add_dependency(&request->priotree,
+						       &prev->priotree,
+						       &request->dep,
+						       0);
+	}
 
 	spin_lock_irq(&timeline->lock);
 	list_add_tail(&request->link, &timeline->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index d8904863d3d9..584d76170df3 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -41,6 +41,32 @@ struct intel_signal_node {
 	struct intel_wait wait;
 };
 
+struct i915_dependency {
+	struct i915_priotree *signaler;
+	struct list_head signal_link;
+	struct list_head wait_link;
+	struct list_head dfs_link;
+	unsigned long flags;
+#define I915_DEPENDENCY_ALLOC BIT(0)
+};
+
+/* Requests exist in a complex web of interdependencies. Each request
+ * has to wait for some other request to complete before it is ready to be run
+ * (e.g. we have to wait until the pixels have been rendered into a texture
+ * before we can copy from it). We track the readiness of a request in terms
+ * of fences, but we also need to keep the dependency tree for the lifetime
+ * of the request (beyond the life of an individual fence). We use the tree
+ * at various points to reorder the requests whilst keeping the requests
+ * in order with respect to their various dependencies.
+ */
+struct i915_priotree {
+	struct list_head signalers_list; /* those before us, we depend upon */
+	struct list_head waiters_list; /* those after us, they depend upon us */
+	struct rb_node node;
+	int priority;
+#define I915_PRIORITY_MAX 1024
+};
+
 /**
  * Request queue structure.
  *
@@ -102,6 +128,17 @@ struct drm_i915_gem_request {
 	wait_queue_t submitq;
 	wait_queue_t execq;
 
+	/* A list of everyone we wait upon, and everyone who waits upon us.
+	 * Even though we will not be submitted to the hardware before the
+	 * submit fence is signaled (it waits for all external events as well
+	 * as our own requests), the scheduler still needs to know the
+	 * dependency tree for the lifetime of the request (from execbuf
+	 * to retirement), i.e. bidirectional dependency information for the
+	 * request not tied to individual fences.
+	 */
+	struct i915_priotree priotree;
+	struct i915_dependency dep;
+
 	u32 global_seqno;
 
 	/** GEM sequence number associated with the previous request,
@@ -158,9 +195,6 @@ struct drm_i915_gem_request {
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_list;
-
-	/** Link in the execlist submission queue, guarded by execlist_lock. */
-	struct list_head execlist_link;
 };
 
 extern const struct dma_fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 83438c6a8864..7c6819968307 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1532,6 +1532,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = i915_guc_submit;
+		engine->schedule = NULL;
 
 		/* Replay the current set of previously submitted requests */
 		list_for_each_entry(request,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c9171a058478..3da4d466e332 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
  */
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
-	INIT_LIST_HEAD(&engine->execlist_queue);
+	engine->execlist_queue = RB_ROOT;
+	engine->execlist_first = NULL;
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 94933f4297bb..af944a246511 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *cursor, *last;
+	struct drm_i915_gem_request *last;
 	struct execlist_port *port = engine->execlist_port;
 	unsigned long flags;
+	struct rb_node *rb;
 	bool submit = false;
 
 	last = port->request;
@@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
 		 * (e.g. GVT saying never to combine contexts).
@@ -503,6 +508,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			port++;
 		}
 
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
 		/* We keep the previous context alive until we retire the
 		 * following request. This ensures that the context object
 		 * is still pinned for any residual writes the HW makes into it
@@ -517,11 +527,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		submit = true;
 	}
 	if (submit) {
-		/* Decouple all the requests submitted from the queue */
-		engine->execlist_queue.next = &cursor->execlist_link;
-		cursor->execlist_link.prev = &engine->execlist_queue;
-
 		i915_gem_request_assign(&port->request, last);
+		engine->execlist_first = rb;
 	}
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
@@ -604,17 +611,126 @@ static void intel_lrc_irq_handler(unsigned long data)
 	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
+static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
+{
+	struct rb_node **p, *rb;
+	bool first = true;
+
+	/* most positive priority is scheduled first, equal priorities fifo */
+	rb = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct i915_priotree *pos;
+
+		rb = *p;
+		pos = rb_entry(rb, typeof(*pos), node);
+		if (pt->priority > pos->priority) {
+			p = &rb->rb_left;
+		} else {
+			p = &rb->rb_right;
+			first = false;
+		}
+	}
+	rb_link_node(&pt->node, rb, p);
+	rb_insert_color(&pt->node, root);
+
+	return first;
+}
+
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
 
 	assert_spin_locked(&engine->timeline->lock);
 
-	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (insert_request(&request->priotree, &engine->execlist_queue))
+		engine->execlist_first = &request->priotree.node;
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 }
 
+static struct intel_engine_cs *
+pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
+{
+	struct intel_engine_cs *engine;
+
+	engine = container_of(pt,
+			      struct drm_i915_gem_request,
+			      priotree)->engine;
+	if (engine != locked) {
+		if (locked)
+			spin_unlock_irq(&locked->timeline->lock);
+		spin_lock_irq(&engine->timeline->lock);
+	}
+
+	return engine;
+}
+
+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
+{
+	struct intel_engine_cs *engine = NULL;
+	struct i915_dependency *dep, *p;
+	struct i915_dependency stack;
+	LIST_HEAD(dfs);
+
+	if (prio <= READ_ONCE(request->priotree.priority))
+		return;
+
+	/* Need BKL in order to use the temporary link inside i915_dependency */
+	lockdep_assert_held(&request->i915->drm.struct_mutex);
+
+	stack.signaler = &request->priotree;
+	list_add(&stack.dfs_link, &dfs);
+
+	/* Recursively bump all dependent priorities to match the new request */
+	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		list_for_each_entry(p, &pt->signalers_list, signal_link)
+			if (prio > READ_ONCE(p->signaler->priority))
+				list_move_tail(&p->dfs_link, &dfs);
+
+		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
+		if (!RB_EMPTY_NODE(&pt->node))
+			continue;
+
+		engine = pt_lock_engine(pt, engine);
+
+		/* If it is not already in the rbtree, we can update the
+		 * priority inplace and skip over it (and its dependencies)
+		 * if it is referenced again as we descend the dfs.
+		 */
+		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
+			pt->priority = prio;
+			list_del_init(&dep->dfs_link);
+		}
+	}
+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		INIT_LIST_HEAD(&dep->dfs_link);
+
+		engine = pt_lock_engine(pt, engine);
+
+		if (prio <= pt->priority)
+			continue;
+
+		GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));
+
+		pt->priority = prio;
+		rb_erase(&pt->node, &engine->execlist_queue);
+		if (insert_request(pt, &engine->execlist_queue))
+			engine->execlist_first = &pt->node;
+	}
+
+	if (engine)
+		spin_unlock_irq(&engine->timeline->lock);
+
+	/* XXX Do we need to preempt to make room for us and our deps? */
+}
+
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -1651,8 +1767,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = execlists_submit_request;
+		engine->schedule = execlists_schedule;
+	}
 }
 
 static void
@@ -1665,6 +1783,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 	engine->submit_request = execlists_submit_request;
+	engine->schedule = execlists_schedule;
 
 	engine->irq_enable = gen8_logical_ring_enable_irq;
 	engine->irq_disable = gen8_logical_ring_disable_irq;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 75991a3c694b..cbc148863a03 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -348,7 +348,8 @@ struct intel_engine_cs {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
 	} execlist_port[2];
-	struct list_head execlist_queue;
+	struct rb_root execlist_queue;
+	struct rb_node *execlist_first;
 	unsigned int fw_domains;
 	bool disable_lite_restore_wa;
 	bool preempt_wa;
-- 
2.10.2

* [PATCH v2 07/11] drm/i915/scheduler: Boost priorities for flips
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

Boost the priority of any rendering required to show the next pageflip,
as we want to avoid missing the vblank by being delayed behind invisible
workloads. We prioritise avoiding jank and jitter in the GUI over the
risk of starving background tasks.
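
Condensed, the boost is just a call into the hook added earlier in the
series; for a single i915 fence it amounts to (see
__fence_set_priority() below):

	struct drm_i915_gem_request *rq = to_request(fence);

	if (rq->engine->schedule)
		rq->engine->schedule(rq, I915_PRIORITY_DISPLAY);

execlists_schedule() then re-sorts the flip's whole dependency chain to
the front of the engine's queue.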

v2: Descend dma_fence_array when boosting priorities.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h      |  5 +++
 drivers/gpu/drm/i915/i915_gem.c      | 63 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_display.c |  2 ++
 3 files changed, 70 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index e790147209f3..5c658c6d06e4 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3420,6 +3420,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
 			 unsigned int flags,
 			 long timeout,
 			 struct intel_rps_client *rps);
+int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+				  unsigned int flags,
+				  int priority);
+#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
+
 int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 				  bool write);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a4dc2da2323a..ae31686fb16d 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -34,6 +34,7 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 #include "intel_mocs.h"
+#include <linux/dma-fence-array.h>
 #include <linux/reservation.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
@@ -433,6 +434,68 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
 	return timeout;
 }
 
+static void __fence_set_priority(struct dma_fence *fence, int prio)
+{
+	struct drm_i915_gem_request *rq;
+	struct intel_engine_cs *engine;
+
+	if (!dma_fence_is_i915(fence))
+		return;
+
+	rq = to_request(fence);
+	engine = rq->engine;
+	if (!engine->schedule)
+		return;
+
+	engine->schedule(rq, prio);
+}
+
+static void fence_set_priority(struct dma_fence *fence, int prio)
+{
+	if (dma_fence_is_array(fence)) {
+		struct dma_fence_array *array = to_dma_fence_array(fence);
+		int i;
+
+		for (i = 0; i < array->num_fences; i++)
+			__fence_set_priority(array->fences[i], prio);
+	} else
+		__fence_set_priority(fence, prio);
+}
+
+int
+i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+			      unsigned int flags,
+			      int prio)
+{
+	struct dma_fence *excl;
+
+	if (flags & I915_WAIT_ALL) {
+		struct dma_fence **shared;
+		unsigned int count, i;
+		int ret;
+
+		ret = reservation_object_get_fences_rcu(obj->resv,
+							&excl, &count, &shared);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < count; i++) {
+			fence_set_priority(shared[i], prio);
+			dma_fence_put(shared[i]);
+		}
+
+		kfree(shared);
+	} else {
+		excl = reservation_object_get_excl_rcu(obj->resv);
+	}
+
+	if (excl) {
+		fence_set_priority(excl, prio);
+		dma_fence_put(excl);
+	}
+	return 0;
+}
+
 /**
  * Waits for rendering to the object to be completed
  * @obj: i915 gem object
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 92ab01f33208..650e2e452a2c 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -14784,6 +14784,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 						      GFP_KERNEL);
 		if (ret < 0)
 			return ret;
+
+		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
 	}
 
 	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
-- 
2.10.2

* [PATCH v2 08/11] HACK drm/i915/scheduler: emulate a scheduler for guc
From: Chris Wilson @ 2016-11-07 13:59 UTC
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).
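
To make the two-slot behaviour concrete, here is a toy, host-side model
of the dequeue (a sketch only, in plain userspace C rather than kernel
code; the real i915_guc_dequeue below walks the engine's priority tree
under the timeline lock): consecutive requests from the same context
coalesce into one port, and a third context must wait for the tasklet
to retire a port before it can be submitted.

  /* Toy model of the two-port dispatch; illustrative, not kernel code. */
  #include <stdio.h>

  struct toy_request { int ctx; int seqno; };

  static struct toy_request *port[2]; /* stands in for engine->execlist_port */

  static int toy_dequeue(struct toy_request **queue, int count)
  {
          int i, slot = 0;

          for (i = 0; i < count; i++) {
                  if (port[slot] && queue[i]->ctx != port[slot]->ctx) {
                          if (++slot == 2)
                                  break; /* both ports busy: defer to next irq */
                  }
                  port[slot] = queue[i]; /* same ctx: coalesce into the port */
          }

          return i; /* number of requests submitted */
  }

  int main(void)
  {
          struct toy_request a = { 1, 1 }, b = { 1, 2 }, c = { 2, 3 }, d = { 3, 4 };
          struct toy_request *queue[] = { &a, &b, &c, &d };
          int n = toy_dequeue(queue, 4);

          /* prints: submitted 3 of 4; ctx 1 and ctx 2 in flight */
          printf("submitted %d of 4; ctx %d and ctx %d in flight\n",
                 n, port[0]->ctx, port[1]->ctx);
          return 0;
  }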
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 83 ++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  3 --
 3 files changed, 75 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 7c6819968307..088f5a99ecfc 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
 	freespace -= gc->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		gc->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 
 	return ret;
 }
@@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	gc->wq_rsvd -= wqi_size;
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	spin_unlock(&client->wq_lock);
 }
 
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			dma_fence_enable_sw_signaling(&last->fence);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		dma_fence_enable_sw_signaling(&last->fence);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			rq = port[1].request;
+			port[0].request = rq;
+			port[1].request = NULL;
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
-		list_for_each_entry(request,
-				    &engine->timeline->requests, link) {
+		list_for_each_entry(request, &engine->timeline->requests, link)
 			client->wq_rsvd += sizeof(struct guc_wq_item);
-			if (i915_sw_fence_done(&request->submit))
-				i915_guc_submit(request);
-		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 6d7505b5c5e7..217f63e17e4e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		tasklet_schedule(&engine->irq_tasklet);
 		notify_ring(engine);
+	}
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		tasklet_schedule(&engine->irq_tasklet);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index af944a246511..7c8e768b736e 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1399,9 +1399,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
 	if (request->ctx != port[0].request->ctx) {
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (7 preceding siblings ...)
  2016-11-07 13:59 ` [PATCH v2 08/11] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2016-11-07 13:59 ` Chris Wilson
  2016-11-10 13:02   ` Tvrtko Ursulin
  2016-11-07 13:59 ` [PATCH v2 10/11] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
                   ` (4 subsequent siblings)
  13 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-07 13:59 UTC (permalink / raw)
  To: intel-gfx

Use a priority stored in the context as the initial value when
submitting a request. This allows us to change the default priority on a
per-context basis, allowing different contexts to be favoured with GPU
time at the expense of lower importance work. The user can adjust the
context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
values being higher priority (they will be serviced earlier, after their
dependencies have been resolved). Any prerequisite work for an execbuf
will have its priority raised to match the new request as required.

Normal users can specify any value in the range of -1023 to 0 [default],
i.e. they can reduce the priority of their workloads (and temporarily
boost it back to normal if so desired).

Privileged users can specify any value in the range of -1023 to 1023,
[default is 0], i.e. they can raise their priority above all others and
so potentially starve the system.

Note that the existing schedulers are neither fair nor load balancing;
execution is strictly by priority on a first-come, first-served basis,
and the driver may choose to boost some requests above the range
available to users.

This priority was originally based around nice(2), but evolved to allow
clients to adjust their priority within a small range, and allow for a
privileged high priority range.

For example, this can be used to implement EGL_IMG_context_priority
https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt

	"EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
        the context to be created. This attribute is a hint, as an
        implementation may not support multiple contexts at some
        priority levels and system policy may limit access to high
        priority contexts to appropriate system privilege level. The
        default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
        EGL_CONTEXT_PRIORITY_MEDIUM_IMG."

so we can map

	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
	PRIORITY_MED -> 0 [default]
	PRIORITY_LOW -> -1023

They also map onto the priorities used by VkQueue (and a VkQueue is
essentially a timeline, our i915_gem_context under full-ppgtt).
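
For illustration, adjusting a context's priority from userspace is a
single setparam call. A minimal sketch, assuming the
I915_CONTEXT_PARAM_PRIORITY uapi added below, drmIoctl() from libdrm,
and a ctx_id from an earlier context-create (the helper itself is not
part of this patch):

  #include <stdint.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  static int set_context_priority(int fd, uint32_t ctx_id, int prio)
  {
          struct drm_i915_gem_context_param p = {
                  .ctx_id = ctx_id,
                  .size = 0, /* must be zero, or the kernel returns -EINVAL */
                  .param = I915_CONTEXT_PARAM_PRIORITY,
                  .value = prio, /* -1023..0 for everyone; >0 needs CAP_SYS_ADMIN */
          };

          return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
  }

A compositor could, for example, demote background clients to -512
while leaving the default 0 for the foreground.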

Testcase: igt/gem_exec_schedule
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem_context.c | 21 +++++++++++++++++++++
 drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
 include/uapi/drm/i915_drm.h             |  3 +++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5c658c6d06e4..d253aeee0fb2 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -949,6 +949,7 @@ struct i915_gem_context {
 	/* Unique identifier for this context, used by the hw for tracking */
 	unsigned int hw_id;
 	u32 user_handle;
+	int priority; /* greater priorities are serviced first */
 
 	u32 ggtt_alignment;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 6dd475735f0a..48b5aacf5fc2 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
 		return PTR_ERR(ctx);
 	}
 
+	ctx->priority = -I915_PRIORITY_MAX; /* lowest priority; idle task */
 	dev_priv->kernel_context = ctx;
 
 	DRM_DEBUG_DRIVER("%s context support initialized\n",
@@ -1100,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
+	case I915_CONTEXT_PARAM_PRIORITY:
+		args->value = ctx->priority;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1155,6 +1159,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
 		}
 		break;
+
+	case I915_CONTEXT_PARAM_PRIORITY:
+		{
+			int priority = args->value;
+
+			if (args->size)
+				ret = -EINVAL;
+			else if (priority >= I915_PRIORITY_MAX ||
+				 priority <= -I915_PRIORITY_MAX)
+				ret = -EINVAL;
+			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
+				ret = -EPERM;
+			else
+				ctx->priority = priority;
+		}
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 278b103a4e95..cfda095f0234 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -861,7 +861,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * run at the earliest possible convenience.
 	 */
 	if (engine->schedule)
-		engine->schedule(request, 0);
+		engine->schedule(request, request->ctx->priority);
 
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1c12a350eca3..47901a8ad682 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -391,6 +391,8 @@ typedef struct drm_i915_irq_wait {
 
 /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
  * priorities and the driver will attempt to execute batches in priority order.
+ * The initial priority for each batch is supplied by the context and is
+ * controlled via I915_CONTEXT_PARAM_PRIORITY.
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
@@ -1224,6 +1226,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_PRIORITY	0x5
 	__u64 value;
 };
 
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v2 10/11] drm/i915: Enable userspace to opt-out of implicit fencing
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (8 preceding siblings ...)
  2016-11-07 13:59 ` [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2016-11-07 13:59 ` Chris Wilson
  2016-11-07 13:59 ` [PATCH v2 11/11] drm/i915: Support explicit fencing for execbuf Chris Wilson
                   ` (3 subsequent siblings)
  13 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-07 13:59 UTC (permalink / raw)
  To: intel-gfx

Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we must always wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). It follows that
userspace needs a mechanism to avoid the kernel's serialisation on its
implicit fences before execbuf execution.

The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag needs to be set
only once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).

Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.

Serious caveat: write ordering is not strictly correct after setting
this flag on a render target on multiple engines. This affects all
subsequent GEM operations (execbuf, set-domain, pread) and shared
dma-buf operations. A fix is possible - but costly (both in terms of
further ABI changes and runtime overhead).
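
As a rough usage sketch (assuming the EXEC_OBJECT_ASYNC flag defined
below; relocation and batch setup are elided, so the helper's arguments
are purely illustrative):

  #include <stdint.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  static int exec_no_implicit_sync(int fd, uint32_t handle, uint32_t len)
  {
          struct drm_i915_gem_exec_object2 obj = {
                  .handle = handle,
                  /* Do not wait on the object's implicit fences; the
                   * caller promises to order access to it explicitly.
                   */
                  .flags = EXEC_OBJECT_ASYNC,
          };
          struct drm_i915_gem_execbuffer2 execbuf = {
                  .buffers_ptr = (uintptr_t)&obj,
                  .buffer_count = 1,
                  .batch_len = len,
          };

          return drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2, &execbuf);
  }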

Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index dc4db5ebf869..1b8642301a52 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -343,6 +343,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
+	case I915_PARAM_HAS_EXEC_ASYNC:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 322c580a739f..0d6d758db5b9 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1111,6 +1111,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
+			continue;
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 47901a8ad682..4bd83c0b07db 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -396,6 +396,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 42
+
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -736,8 +742,29 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by the kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
 	__u64 flags;
 
 	union {
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v2 11/11] drm/i915: Support explicit fencing for execbuf
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (9 preceding siblings ...)
  2016-11-07 13:59 ` [PATCH v2 10/11] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2016-11-07 13:59 ` Chris Wilson
  2016-11-07 15:18 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines Patchwork
                   ` (2 subsequent siblings)
  13 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-07 13:59 UTC (permalink / raw)
  To: intel-gfx

Now that the user can opt out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.
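
To make the rsvd2 packing concrete, a sketch of the round trip
(assuming the I915_EXEC_FENCE_IN/OUT flags and the _WR ioctl defined
below): the in-fence fd is passed in the low 32 bits of rsvd2, and on
success the new out-fence fd comes back in the high 32 bits.

  #include <stdint.h>
  #include <xf86drm.h>
  #include <i915_drm.h>

  static int exec_with_fences(int fd, struct drm_i915_gem_execbuffer2 *eb,
                              int in_fence, int *out_fence)
  {
          int err;

          eb->flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
          eb->rsvd2 = (uint32_t)in_fence; /* in-fence in the low 32 bits */

          /* The _WR variant is required so the out-fence is written back. */
          err = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, eb);
          if (err)
                  return err;

          *out_fence = eb->rsvd2 >> 32; /* close() when no longer needed */
          return 0;
  }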

Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/Kconfig               |  1 +
 drivers/gpu/drm/i915/i915_drv.c            |  3 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h                | 35 ++++++++++++++++++-
 4 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index df96aed6975a..8e93b61bd8c3 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -19,6 +19,7 @@ config DRM_I915
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
 	select ACPI_BUTTON if ACPI
+	select SYNC_FILE
 	help
 	  Choose this option if you have a system that has "Intel Graphics
 	  Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 1b8642301a52..cb20854ea37e 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -344,6 +344,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
 	case I915_PARAM_HAS_EXEC_ASYNC:
+	case I915_PARAM_HAS_EXEC_FENCE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
@@ -2533,7 +2534,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
-	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0d6d758db5b9..55a8db2690c6 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
 
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/sync_file.h>
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
@@ -1589,6 +1590,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct i915_execbuffer_params *params = &params_master;
 	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 dispatch_flags;
+	struct dma_fence *in_fence = NULL;
+	struct sync_file *out_fence = NULL;
+	int out_fence_fd = -1;
 	int ret;
 	bool need_relocs;
 
@@ -1632,6 +1636,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_IN) {
+		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!in_fence) {
+			ret = -EINVAL;
+			goto pre_mutex_err;
+		}
+	}
+
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0) {
+			ret = out_fence_fd;
+			out_fence_fd = -1;
+			goto pre_mutex_err;
+		}
+	}
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1776,6 +1797,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err_batch_unpin;
 	}
 
+	if (in_fence) {
+		ret = i915_gem_request_await_dma_fence(params->request,
+						       in_fence);
+		if (ret < 0)
+			goto err_request;
+	}
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(&params->request->fence);
+		if (!out_fence) {
+			ret = -ENOMEM;
+			goto err_request;
+		}
+	}
+
 	/* Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the batch_obj be moved onto the
@@ -1803,6 +1839,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	ret = execbuf_submit(params, args, &eb->vmas);
 err_request:
 	__i915_add_request(params->request, ret == 0);
+	if (out_fence) {
+		if (ret == 0) {
+			fd_install(out_fence_fd, out_fence->file);
+			args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
+			args->rsvd2 |= (u64)out_fence_fd << 32;
+			out_fence_fd = -1;
+		} else {
+			fput(out_fence->file);
+		}
+	}
 
 err_batch_unpin:
 	/*
@@ -1824,6 +1870,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+	if (out_fence_fd != -1)
+		put_unused_fd(out_fence_fd);
+	dma_fence_put(in_fence);
 	return ret;
 }
 
@@ -1931,11 +1980,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = drm_malloc_gfp(args->buffer_count,
 				    sizeof(*exec2_list),
 				    GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4bd83c0b07db..90082269fb50 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
 #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
 #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
 #define DRM_I915_GEM_WAIT	0x2c
@@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -401,6 +403,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_ASYNC	 42
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 43
 
 typedef struct drm_i915_getparam {
 	__s32 param;
@@ -854,7 +862,32 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be closed after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (10 preceding siblings ...)
  2016-11-07 13:59 ` [PATCH v2 11/11] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-11-07 15:18 ` Patchwork
  2016-11-10 11:45 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2) Patchwork
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
  13 siblings, 0 replies; 82+ messages in thread
From: Patchwork @ 2016-11-07 15:18 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines
URL   : https://patchwork.freedesktop.org/series/14926/
State : success

== Summary ==

Series 14926v1 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/14926/revisions/1/mbox/


fi-bdw-5557u     total:241  pass:226  dwarn:0   dfail:0   fail:0   skip:15 
fi-bsw-n3050     total:241  pass:201  dwarn:0   dfail:0   fail:0   skip:40 
fi-bxt-t5700     total:241  pass:213  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-j1900     total:241  pass:213  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:241  pass:209  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:241  pass:221  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:241  pass:221  dwarn:0   dfail:0   fail:0   skip:20 
fi-ilk-650       total:241  pass:188  dwarn:0   dfail:0   fail:0   skip:53 
fi-ivb-3520m     total:241  pass:219  dwarn:0   dfail:0   fail:0   skip:22 
fi-ivb-3770      total:241  pass:219  dwarn:0   dfail:0   fail:0   skip:22 
fi-kbl-7200u     total:241  pass:219  dwarn:0   dfail:0   fail:0   skip:22 
fi-skl-6260u     total:241  pass:227  dwarn:0   dfail:0   fail:0   skip:14 
fi-skl-6700hq    total:241  pass:220  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6700k     total:241  pass:219  dwarn:1   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:241  pass:227  dwarn:0   dfail:0   fail:0   skip:14 
fi-snb-2520m     total:241  pass:209  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:241  pass:208  dwarn:0   dfail:0   fail:0   skip:33 

44f80301cde325b9a33e594f8bec88f84e02fffa drm-intel-nightly: 2016y-11m-07d-12h-48m-36s UTC integration manifest
6716437 drm/i915: Support explicit fencing for execbuf
0ceaa08 drm/i915: Enable userspace to opt-out of implicit fencing
e393102 drm/i915/scheduler: Support user-defined priorities
30873fa HACK drm/i915/scheduler: emulate a scheduler for guc
ee261f83 drm/i915/scheduler: Boost priorities for flips
ccad888 drm/i915/scheduler: Record all dependencies upon request construction
01edcb1 drm/i915/scheduler: Signal the arrival of a new request
82cefae drm/i915: Remove engine->execlist_lock
24066ba drm/i915: Defer transfer onto execution timeline to actual hw submission
b464542 drm/i915: Split request submit/execute phase into two
ce170b4 drm/i915: Create distinct lockclasses for execution vs user timelines

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_2921/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 01/11] drm/i915: Create distinct lockclasses for execution vs user timelines
  2016-11-07 13:59 ` [PATCH v2 01/11] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
@ 2016-11-08  7:43   ` Joonas Lahtinen
  2016-11-08  8:50     ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Joonas Lahtinen @ 2016-11-08  7:43 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ma, 2016-11-07 at 13:59 +0000, Chris Wilson wrote:
> @@ -56,6 +61,24 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
>  	return 0;
>  }
>  
> +int i915_gem_timeline_init(struct drm_i915_private *i915,
> +			   struct i915_gem_timeline *timeline,
> +			   const char *name)
> +{
> +	static struct lock_class_key class;
> +	return __i915_gem_timeline_init(i915, timeline, name,
> +					&class, "timeline");
> +}
> +
> +int i915_gem_timeline_init__global(struct drm_i915_private *i915)
> +{
> +	static struct lock_class_key class;
> +	return __i915_gem_timeline_init(i915,
> +					&i915->gt.global_timeline,
> +					"[execution]",
> +					&class, "global-timeline");
> +}
> +

These names might have the potential to be confusing in a lockdep splat,
don't you think?

> @@ -67,6 +67,7 @@ struct i915_gem_timeline {
>  int i915_gem_timeline_init(struct drm_i915_private *i915,
>  			   struct i915_gem_timeline *tl,
>  			   const char *name);
> +int i915_gem_timeline_init__global(struct drm_i915_private *i915);

I'm not super fond of this function name either :P But init_timeline vs
timeline_init is also confusing.

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 01/11] drm/i915: Create distinct lockclasses for execution vs user timelines
  2016-11-08  7:43   ` Joonas Lahtinen
@ 2016-11-08  8:50     ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-08  8:50 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Tue, Nov 08, 2016 at 09:43:21AM +0200, Joonas Lahtinen wrote:
> On ma, 2016-11-07 at 13:59 +0000, Chris Wilson wrote:
> > @@ -56,6 +61,24 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
> >  	return 0;
> >  }
> >  
> > +int i915_gem_timeline_init(struct drm_i915_private *i915,
> > +			   struct i915_gem_timeline *timeline,
> > +			   const char *name)
> > +{
> > +	static struct lock_class_key class;
> > +	return __i915_gem_timeline_init(i915, timeline, name,
> > +					&class, "timeline");
> > +}
> > +
> > +int i915_gem_timeline_init__global(struct drm_i915_private *i915)
> > +{
> > +	static struct lock_class_key class;
> > +	return __i915_gem_timeline_init(i915,
> > +					&i915->gt.global_timeline,
> > +					"[execution]",
> > +					&class, "global-timeline");
> > +}
> > +
> 
> These names might have potential to be confusing in lockdep splat,
> don't you think?

Urm, I lost a fixup. They are meant to be &timeline->lock and
&global_timeline->lock.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 02/11] drm/i915: Split request submit/execute phase into two
  2016-11-07 13:59 ` [PATCH v2 02/11] drm/i915: Split request submit/execute phase into two Chris Wilson
@ 2016-11-08  9:06   ` Joonas Lahtinen
  0 siblings, 0 replies; 82+ messages in thread
From: Joonas Lahtinen @ 2016-11-08  9:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ma, 2016-11-07 at 13:59 +0000, Chris Wilson wrote:
> In order to support deferred scheduling, we need to differentiate
> between when the request is ready to run (i.e. the submit fence is
> signaled) and when the request is actually run (a new execute fence).
> This is typically split between the request itself wanting to wait upon
> others (for which we use the submit fence) and the CPU wanting to wait
> upon the request, for which we use the execute fence to be sure the
> hardware is ready to signal completion.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07 13:59 ` [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
@ 2016-11-08 12:20   ` Chris Wilson
  2016-11-10 10:44     ` Tvrtko Ursulin
  2016-11-10 14:45   ` Tvrtko Ursulin
  1 sibling, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-08 12:20 UTC (permalink / raw)
  To: intel-gfx

On Mon, Nov 07, 2016 at 01:59:45PM +0000, Chris Wilson wrote:
> The scheduler needs to know the dependencies of each request for the
> lifetime of the request, as it may choose to reschedule the requests at
> any time and must ensure the dependency tree is not broken. This is in
> additional to using the fence to only allow execution after all
> dependencies have been completed.
> 
> One option was to extend the fence to support the bidirectional
> dependency tracking required by the scheduler. However the mismatch in
> lifetimes between the submit fence and the request essentially meant
> that we had to build a completely separate struct (and we could not
> simply reuse the existing waitqueue in the fence for one half of the
> dependency tracking). The extra dependency tracking simply did not mesh
> well with the fence, and keeping it separate both keeps the fence
> implementation simpler and allows us to extend the dependency tracking
> into a priority tree (whilst maintaining support for reordering the
> tree).
> 
> To avoid the additional allocations and list manipulations, the use of
> the priotree is disabled when there are no schedulers to use it.
> 
> v2: Create a dedicated slab for i915_dependency.
>     Rename the lists.

Sod. I've squashed the priority sort into this as a rebase calamity.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-07 13:59 ` [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
@ 2016-11-10 10:43   ` Tvrtko Ursulin
  2016-11-10 11:11     ` Chris Wilson
  2016-11-10 11:23     ` [PATCH v3] " Chris Wilson
  0 siblings, 2 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 10:43 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 13:59, Chris Wilson wrote:
> Defer the transfer from the client's timeline onto the execution
> timeline from the point of readiness to the point of actual submission.
> For example, in execlists, a request is finally submitted to hardware
> when the hardware is ready, and only put onto the hardware queue when
> the request is ready. By deferring the transfer, we ensure that the
> timeline is maintained in retirement order if we decide to queue the
> requests onto the hardware in a different order than fifo.
>
> v2: Rebased onto distinct global/user timeline lock classes.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
>  drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
>  drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
>  drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
>  5 files changed, 49 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index e41d51a68ed8..19c29fafb07a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
>  	return atomic_inc_return(&tl->next_seqno);
>  }
>
> -static int __i915_sw_fence_call
> -submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_gem_request *request =
> -		container_of(fence, typeof(*request), submit);
>  	struct intel_engine_cs *engine = request->engine;
>  	struct intel_timeline *timeline;
> -	unsigned long flags;
>  	u32 seqno;
>
> -	if (state != FENCE_COMPLETE)
> -		return NOTIFY_DONE;
> -
>  	/* Transfer from per-context onto the global per-engine timeline */
>  	timeline = engine->timeline;
>  	GEM_BUG_ON(timeline == request->timeline);
> -
> -	/* Will be called from irq-context when using foreign DMA fences */
> -	spin_lock_irqsave(&timeline->lock, flags);
> +	assert_spin_locked(&timeline->lock);
>
>  	seqno = timeline_get_seqno(timeline->common);
>  	GEM_BUG_ON(!seqno);
> @@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  	GEM_BUG_ON(!request->global_seqno);
>  	engine->emit_breadcrumb(request,
>  				request->ring->vaddr + request->postfix);
> -	engine->submit_request(request);
>
>  	spin_lock(&request->timeline->lock);
>  	list_move_tail(&request->link, &timeline->requests);
>  	spin_unlock(&request->timeline->lock);
>
>  	i915_sw_fence_commit(&request->execute);
> +}
>
> -	spin_unlock_irqrestore(&timeline->lock, flags);
> +static int __i915_sw_fence_call
> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +	if (state == FENCE_COMPLETE) {
> +		struct drm_i915_gem_request *request =
> +			container_of(fence, typeof(*request), submit);
> +		struct intel_engine_cs *engine = request->engine;
> +		unsigned long flags;
> +
> +		/* Will be called from irq-context when using foreign fences. */
> +		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
> +					 SINGLE_DEPTH_NESTING);
> +		engine->submit_request(request);
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);

Would it be cleaner to move the lock taking to engine->submit_request?

And is _nested still required? I thought you said it is not. I can't 
find signalling under the timeline lock either.

Regards,

Tvrtko



> +	}
>
>  	return NOTIFY_DONE;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index c8547d9b9004..d8904863d3d9 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -241,6 +241,8 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
>  #define i915_add_request_no_flush(req) \
>  	__i915_add_request(req, false)
>
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request);
> +
>  struct intel_rps_client;
>  #define NO_WAITBOOST ERR_PTR(-1)
>  #define IS_RPS_CLIENT(p) (!IS_ERR(p))
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 666dab7a675a..83438c6a8864 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
>  	struct drm_i915_private *dev_priv = rq->i915;
> -	unsigned int engine_id = rq->engine->id;
> +	struct intel_engine_cs *engine = rq->engine;
> +	unsigned int engine_id = engine->id;
>  	struct intel_guc *guc = &rq->i915->guc;
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	/* We keep the previous context alive until we retire the following
> +	 * request. This ensures that any the context object is still pinned
> +	 * for any residual writes the HW makes into it on the context switch
> +	 * into the next object following the breadcrumb. Otherwise, we may
> +	 * retire the context too early.
> +	 */
> +	rq->previous_context = engine->last_context;
> +	engine->last_context = rq->ctx;
> +
> +	__i915_gem_request_submit(rq);
> +
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index fa3012c342cc..fa4920e72e93 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *cursor, *last;
>  	struct execlist_port *port = engine->execlist_port;
> +	unsigned long flags;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 * and context switches) submission.
>  	 */
>
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>  	spin_lock(&engine->execlist_lock);
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
>  		/* Can we combine this request with the current port? It has to
> @@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			i915_gem_request_assign(&port->request, last);
>  			port++;
>  		}
> +
> +		/* We keep the previous context alive until we retire the
> +		 * following request. This ensures that any the context object
> +		 * is still pinned for any residual writes the HW makes into it
> +		 * on the context switch into the next object following the
> +		 * breadcrumb. Otherwise, we may retire the context too early.
> +		 */
> +		cursor->previous_context = engine->last_context;
> +		engine->last_context = cursor->ctx;
> +
> +		__i915_gem_request_submit(cursor);
>  		last = cursor;
>  		submit = true;
>  	}
> @@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		i915_gem_request_assign(&port->request, last);
>  	}
>  	spin_unlock(&engine->execlist_lock);
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>  	if (submit)
>  		execlists_submit_ports(engine);
> @@ -599,15 +613,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>
>  	spin_lock_irqsave(&engine->execlist_lock, flags);
>
> -	/* We keep the previous context alive until we retire the following
> -	 * request. This ensures that any the context object is still pinned
> -	 * for any residual writes the HW makes into it on the context switch
> -	 * into the next object following the breadcrumb. Otherwise, we may
> -	 * retire the context too early.
> -	 */
> -	request->previous_context = engine->last_context;
> -	engine->last_context = request->ctx;
> -
>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 700e93d80616..f91ee24e2763 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct drm_i915_private *dev_priv = request->i915;
>
> +	__i915_gem_request_submit(request);
> +
>  	I915_WRITE_TAIL(request->engine, request->tail);
>  }
>
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-08 12:20   ` Chris Wilson
@ 2016-11-10 10:44     ` Tvrtko Ursulin
  2016-11-10 10:55       ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 10:44 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, tvrtko.ursulin


On 08/11/2016 12:20, Chris Wilson wrote:
> On Mon, Nov 07, 2016 at 01:59:45PM +0000, Chris Wilson wrote:
>> The scheduler needs to know the dependencies of each request for the
>> lifetime of the request, as it may choose to reschedule the requests at
>> any time and must ensure the dependency tree is not broken. This is in
>> additional to using the fence to only allow execution after all
>> dependencies have been completed.
>>
>> One option was to extend the fence to support the bidirectional
>> dependency tracking required by the scheduler. However the mismatch in
>> lifetimes between the submit fence and the request essentially meant
>> that we had to build a completely separate struct (and we could not
>> simply reuse the existing waitqueue in the fence for one half of the
>> dependency tracking). The extra dependency tracking simply did not mesh
>> well with the fence, and keeping it separate both keeps the fence
>> implementation simpler and allows us to extend the dependency tracking
>> into a priority tree (whilst maintaining support for reordering the
>> tree).
>>
>> To avoid the additional allocations and list manipulations, the use of
>> the priotree is disabled when there are no schedulers to use it.
>>
>> v2: Create a dedicated slab for i915_dependency.
>>     Rename the lists.
>
> Sod. I've squashed the priority sort into this as a rebase calamity.

Waiting for the other patches to get looked at before untangling and
respinning, or did you just forget? :)

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 07/11] drm/i915/scheduler: Boost priorities for flips
  2016-11-07 13:59 ` [PATCH v2 07/11] drm/i915/scheduler: Boost priorities for flips Chris Wilson
@ 2016-11-10 10:52   ` Tvrtko Ursulin
  0 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 10:52 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 13:59, Chris Wilson wrote:
> Boost the priority of any rendering required to show the next pageflip
> as we want to avoid missing the vblank by being delayed by invisible
> workload. We prioritise avoiding jank and jitter in the GUI over
> starving background tasks.
>
> v2: Descend dma_fence_array when boosting priorities.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h      |  5 +++
>  drivers/gpu/drm/i915/i915_gem.c      | 63 ++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_display.c |  2 ++
>  3 files changed, 70 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index e790147209f3..5c658c6d06e4 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -3420,6 +3420,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
>  			 unsigned int flags,
>  			 long timeout,
>  			 struct intel_rps_client *rps);
> +int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
> +				  unsigned int flags,
> +				  int priority);
> +#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
> +
>  int __must_check
>  i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
>  				  bool write);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index a4dc2da2323a..ae31686fb16d 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -34,6 +34,7 @@
>  #include "intel_drv.h"
>  #include "intel_frontbuffer.h"
>  #include "intel_mocs.h"
> +#include <linux/dma-fence-array.h>
>  #include <linux/reservation.h>
>  #include <linux/shmem_fs.h>
>  #include <linux/slab.h>
> @@ -433,6 +434,68 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
>  	return timeout;
>  }
>
> +static void __fence_set_priority(struct dma_fence *fence, int prio)
> +{
> +	struct drm_i915_gem_request *rq;
> +	struct intel_engine_cs *engine;
> +
> +	if (!dma_fence_is_i915(fence))
> +		return;
> +
> +	rq = to_request(fence);
> +	engine = rq->engine;
> +	if (!engine->schedule)
> +		return;
> +
> +	engine->schedule(rq, prio);
> +}
> +
> +static void fence_set_priority(struct dma_fence *fence, int prio)
> +{
> +	if (dma_fence_is_array(fence)) {
> +		struct dma_fence_array *array = to_dma_fence_array(fence);
> +		int i;
> +
> +		for (i = 0; i < array->num_fences; i++)
> +			__fence_set_priority(array->fences[i], prio);
> +	} else
> +		__fence_set_priority(fence, prio);
> +}
> +
> +int
> +i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
> +			      unsigned int flags,
> +			      int prio)
> +{
> +	struct dma_fence *excl;
> +
> +	if (flags & I915_WAIT_ALL) {
> +		struct dma_fence **shared;
> +		unsigned int count, i;
> +		int ret;
> +
> +		ret = reservation_object_get_fences_rcu(obj->resv,
> +							&excl, &count, &shared);
> +		if (ret)
> +			return ret;
> +
> +		for (i = 0; i < count; i++) {
> +			fence_set_priority(shared[i], prio);
> +			dma_fence_put(shared[i]);
> +		}
> +
> +		kfree(shared);
> +	} else {
> +		excl = reservation_object_get_excl_rcu(obj->resv);
> +	}
> +
> +	if (excl) {
> +		fence_set_priority(excl, prio);
> +		dma_fence_put(excl);
> +	}
> +	return 0;
> +}
> +
>  /**
>   * Waits for rendering to the object to be completed
>   * @obj: i915 gem object
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index 92ab01f33208..650e2e452a2c 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -14784,6 +14784,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
>  						      GFP_KERNEL);
>  		if (ret < 0)
>  			return ret;
> +
> +		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
>  	}
>
>  	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 10:44     ` Tvrtko Ursulin
@ 2016-11-10 10:55       ` Chris Wilson
  2016-11-10 11:54         ` Tvrtko Ursulin
  0 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 10:55 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 10:44:44AM +0000, Tvrtko Ursulin wrote:
> 
> On 08/11/2016 12:20, Chris Wilson wrote:
> >On Mon, Nov 07, 2016 at 01:59:45PM +0000, Chris Wilson wrote:
> >>The scheduler needs to know the dependencies of each request for the
> >>lifetime of the request, as it may choose to reschedule the requests at
> >>any time and must ensure the dependency tree is not broken. This is in
> >>additional to using the fence to only allow execution after all
> >>dependencies have been completed.
> >>
> >>One option was to extend the fence to support the bidirectional
> >>dependency tracking required by the scheduler. However the mismatch in
> >>lifetimes between the submit fence and the request essentially meant
> >>that we had to build a completely separate struct (and we could not
> >>simply reuse the existing waitqueue in the fence for one half of the
> >>dependency tracking). The extra dependency tracking simply did not mesh
> >>well with the fence, and keeping it separate both keeps the fence
> >>implementation simpler and allows us to extend the dependency tracking
> >>into a priority tree (whilst maintaining support for reordering the
> >>tree).
> >>
> >>To avoid the additional allocations and list manipulations, the use of
> >>the priotree is disabled when there are no schedulers to use it.
> >>
> >>v2: Create a dedicated slab for i915_dependency.
> >>    Rename the lists.
> >
> >Sod. I've squashed the priority sort into this as a rebase calamity.
> 
> Waiting for the other patches to get looked at before
> untangle/respin or just forgot? :)

Just imagine the split ;) Think of the intel_lrc.c as a separate patch.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-10 10:43   ` Tvrtko Ursulin
@ 2016-11-10 11:11     ` Chris Wilson
  2016-11-10 11:51       ` Tvrtko Ursulin
  2016-11-10 11:23     ` [PATCH v3] " Chris Wilson
  1 sibling, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 11:11 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 10:43:29AM +0000, Tvrtko Ursulin wrote:
> 
> On 07/11/2016 13:59, Chris Wilson wrote:
> >Defer the transfer from the client's timeline onto the execution
> >timeline from the point of readiness to the point of actual submission.
> >For example, in execlists, a request is finally submitted to hardware
> >when the hardware is ready, and only put onto the hardware queue when
> >the request is ready. By deferring the transfer, we ensure that the
> >timeline is maintained in retirement order if we decide to queue the
> >requests onto the hardware in a different order than fifo.
> >
> >v2: Rebased onto distinct global/user timeline lock classes.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
> > drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
> > drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
> > drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
> > drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
> > 5 files changed, 49 insertions(+), 23 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index e41d51a68ed8..19c29fafb07a 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
> > 	return atomic_inc_return(&tl->next_seqno);
> > }
> >
> >-static int __i915_sw_fence_call
> >-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
> > {
> >-	struct drm_i915_gem_request *request =
> >-		container_of(fence, typeof(*request), submit);
> > 	struct intel_engine_cs *engine = request->engine;
> > 	struct intel_timeline *timeline;
> >-	unsigned long flags;
> > 	u32 seqno;
> >
> >-	if (state != FENCE_COMPLETE)
> >-		return NOTIFY_DONE;
> >-
> > 	/* Transfer from per-context onto the global per-engine timeline */
> > 	timeline = engine->timeline;
> > 	GEM_BUG_ON(timeline == request->timeline);
> >-
> >-	/* Will be called from irq-context when using foreign DMA fences */
> >-	spin_lock_irqsave(&timeline->lock, flags);
> >+	assert_spin_locked(&timeline->lock);
> >
> > 	seqno = timeline_get_seqno(timeline->common);
> > 	GEM_BUG_ON(!seqno);
> >@@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> > 	GEM_BUG_ON(!request->global_seqno);
> > 	engine->emit_breadcrumb(request,
> > 				request->ring->vaddr + request->postfix);
> >-	engine->submit_request(request);
> >
> > 	spin_lock(&request->timeline->lock);
> > 	list_move_tail(&request->link, &timeline->requests);
> > 	spin_unlock(&request->timeline->lock);
> >
> > 	i915_sw_fence_commit(&request->execute);
> >+}
> >
> >-	spin_unlock_irqrestore(&timeline->lock, flags);
> >+static int __i915_sw_fence_call
> >+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >+{
> >+	if (state == FENCE_COMPLETE) {
> >+		struct drm_i915_gem_request *request =
> >+			container_of(fence, typeof(*request), submit);
> >+		struct intel_engine_cs *engine = request->engine;
> >+		unsigned long flags;
> >+
> >+		/* Will be called from irq-context when using foreign fences. */
> >+		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
> >+					 SINGLE_DEPTH_NESTING);
> >+		engine->submit_request(request);
> >+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
> Would it be cleaner to move the lock taking to engine->submit_request?

Perhaps. Certainly pushes the ugliness down a layer!
 
> And is _nested still required? I thought you said it is not. I can't
> find signalling under the timeline lock either.

It is still required to sort out the ordering between external
lockclasses (vgem/nouveau/etc).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread
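
(As an aside for readers: spin_lock_irqsave_nested() with SINGLE_DEPTH_NESTING is the stock lockdep annotation for taking a second lock of the same lockclass in a known-safe order; it tells lockdep the nesting is intentional rather than a self-deadlock. A minimal sketch of the idiom, independent of the i915 specifics:

	spinlock_t *outer, *inner;	/* two locks sharing one lockclass */
	unsigned long flags;

	spin_lock_irqsave(outer, flags);
	/* Without the _nested annotation, lockdep would flag this second
	 * acquisition of the same lockclass as a potential recursive
	 * deadlock.
	 */
	spin_lock_nested(inner, SINGLE_DEPTH_NESTING);
	spin_unlock(inner);
	spin_unlock_irqrestore(outer, flags);
)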

* [PATCH v3] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-10 10:43   ` Tvrtko Ursulin
  2016-11-10 11:11     ` Chris Wilson
@ 2016-11-10 11:23     ` Chris Wilson
  1 sibling, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 11:23 UTC (permalink / raw)
  To: intel-gfx

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than fifo.

v2: Rebased onto distinct global/user timeline lock classes.
v3: Play with the position of the spin_lock().

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c    | 39 ++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_gem_request.h    |  3 +++
 drivers/gpu/drm/i915/i915_guc_submission.c | 14 ++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
 5 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d0f6b9f82636..149dd821bcf3 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -306,25 +306,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 	return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request =
-		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
-	unsigned long flags;
 	u32 seqno;
 
-	if (state != FENCE_COMPLETE)
-		return NOTIFY_DONE;
-
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
-
-	/* Will be called from irq-context when using foreign DMA fences */
-	spin_lock_irqsave(&timeline->lock, flags);
+	assert_spin_locked(&timeline->lock);
 
 	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
@@ -344,15 +335,37 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
-	engine->submit_request(request);
 
 	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
 	i915_sw_fence_commit(&request->execute);
+}
+
+void i915_gem_request_submit(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	unsigned long flags;
 
-	spin_unlock_irqrestore(&timeline->lock, flags);
+	/* Will be called from irq-context when using foreign fences. */
+	spin_lock_irqsave_nested(&engine->timeline->lock, flags,
+				 SINGLE_DEPTH_NESTING);
+
+	__i915_gem_request_submit(request);
+
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+}
+
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	if (state == FENCE_COMPLETE) {
+		struct drm_i915_gem_request *request =
+			container_of(fence, typeof(*request), submit);
+
+		request->engine->submit_request(request);
+	}
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 544ea3c12961..242f09591ba7 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -229,6 +229,9 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+void i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 666dab7a675a..942f5000d372 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
 	struct drm_i915_private *dev_priv = rq->i915;
-	unsigned int engine_id = rq->engine->id;
+	struct intel_engine_cs *engine = rq->engine;
+	unsigned int engine_id = engine->id;
 	struct intel_guc *guc = &rq->i915->guc;
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;
 
+	/* We keep the previous context alive until we retire the following
+	 * request. This ensures that the context object is still pinned
+	 * for any residual writes the HW makes into it on the context switch
+	 * into the next object following the breadcrumb. Otherwise, we may
+	 * retire the context too early.
+	 */
+	rq->previous_context = engine->last_context;
+	engine->last_context = rq->ctx;
+
+	i915_gem_request_submit(rq);
+
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dde04b7643b1..dca41834dec1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *cursor, *last;
 	struct execlist_port *port = engine->execlist_port;
+	unsigned long flags;
 	bool submit = false;
 
 	last = port->request;
@@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+	spin_lock_irqsave(&engine->timeline->lock, flags);
 	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
@@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			i915_gem_request_assign(&port->request, last);
 			port++;
 		}
+
+		/* We keep the previous context alive until we retire the
+		 * following request. This ensures that the context object
+		 * is still pinned for any residual writes the HW makes into it
+		 * on the context switch into the next object following the
+		 * breadcrumb. Otherwise, we may retire the context too early.
+		 */
+		cursor->previous_context = engine->last_context;
+		engine->last_context = cursor->ctx;
+
+		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
@@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		i915_gem_request_assign(&port->request, last);
 	}
 	spin_unlock(&engine->execlist_lock);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
@@ -621,15 +635,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 
 	spin_lock_irqsave(&engine->execlist_lock, flags);
 
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 700e93d80616..aeb637dc1fdf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_private *dev_priv = request->i915;
 
+	i915_gem_request_submit(request);
+
 	I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread
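
(The naming in v3 follows the usual kernel locked/unlocked split: __i915_gem_request_submit() requires the engine timeline lock to be held — execlists_dequeue() above takes it once around the whole dequeue loop — while i915_gem_request_submit() is the self-locking wrapper. A simple backend's submit_request hook then mirrors i9xx_submit_request() in the hunk above; a sketch of the pattern, with example_submit_request() as a hypothetical backend:

	static void example_submit_request(struct drm_i915_gem_request *request)
	{
		/* Transfer the request from its client timeline onto the
		 * engine timeline (assigning its global seqno) only now,
		 * at the moment of actual hardware submission...
		 */
		i915_gem_request_submit(request);

		/* ...and only then kick the hardware, here by writing the
		 * ring tail.
		 */
		I915_WRITE_TAIL(request->engine, request->tail);
	}
)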

* ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2)
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (11 preceding siblings ...)
  2016-11-07 15:18 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines Patchwork
@ 2016-11-10 11:45 ` Patchwork
  2016-11-10 12:04   ` Saarinen, Jani
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
  13 siblings, 1 reply; 82+ messages in thread
From: Patchwork @ 2016-11-10 11:45 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2)
URL   : https://patchwork.freedesktop.org/series/14926/
State : success

== Summary ==

Series 14926v2 Series without cover letter
https://patchwork.freedesktop.org/api/1.0/series/14926/revisions/2/mbox/


fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28 
fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20 
fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53 
fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22 
fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32 
fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33 
fi-kbl-7200u failed to connect after reboot

eb88955cdc6a1f4dabff6bc27747c1c9e9a3aaef drm-intel-nightly: 2016y-11m-10d-09h-29m-41s UTC integration manifest
b0e329d drm/i915: Support explicit fencing for execbuf
6c4e06f drm/i915: Enable userspace to opt-out of implicit fencing
bfe3e23 drm/i915/scheduler: Support user-defined priorities
465ba68 HACK drm/i915/scheduler: emulate a scheduler for guc
f39120b drm/i915/scheduler: Boost priorities for flips
27e53d9 drm/i915/scheduler: Record all dependencies upon request construction
dda333d drm/i915/scheduler: Signal the arrival of a new request
f1edd87 drm/i915: Remove engine->execlist_lock
aad42bc drm/i915: Defer transfer onto execution timeline to actual hw submission
c646d89 drm/i915: Split request submit/execute phase into two
67c609a drm/i915: Create distinct lockclasses for execution vs user timelines

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_2954/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-10 11:11     ` Chris Wilson
@ 2016-11-10 11:51       ` Tvrtko Ursulin
  2016-11-10 14:43         ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 11:51 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 10/11/2016 11:11, Chris Wilson wrote:
> On Thu, Nov 10, 2016 at 10:43:29AM +0000, Tvrtko Ursulin wrote:
>>
>> On 07/11/2016 13:59, Chris Wilson wrote:
>>> Defer the transfer from the client's timeline onto the execution
>>> timeline from the point of readiness to the point of actual submission.
>>> For example, in execlists, a request is finally submitted to hardware
>>> when the hardware is ready, and only put onto the hardware queue when
>>> the request is ready. By deferring the transfer, we ensure that the
>>> timeline is maintained in retirement order if we decide to queue the
>>> requests onto the hardware in a different order than fifo.
>>>
>>> v2: Rebased onto distinct global/user timeline lock classes.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>> drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
>>> drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
>>> drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
>>> drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
>>> drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
>>> 5 files changed, 49 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>>> index e41d51a68ed8..19c29fafb07a 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>>> @@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
>>> 	return atomic_inc_return(&tl->next_seqno);
>>> }
>>>
>>> -static int __i915_sw_fence_call
>>> -submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>> +void __i915_gem_request_submit(struct drm_i915_gem_request *request)
>>> {
>>> -	struct drm_i915_gem_request *request =
>>> -		container_of(fence, typeof(*request), submit);
>>> 	struct intel_engine_cs *engine = request->engine;
>>> 	struct intel_timeline *timeline;
>>> -	unsigned long flags;
>>> 	u32 seqno;
>>>
>>> -	if (state != FENCE_COMPLETE)
>>> -		return NOTIFY_DONE;
>>> -
>>> 	/* Transfer from per-context onto the global per-engine timeline */
>>> 	timeline = engine->timeline;
>>> 	GEM_BUG_ON(timeline == request->timeline);
>>> -
>>> -	/* Will be called from irq-context when using foreign DMA fences */
>>> -	spin_lock_irqsave(&timeline->lock, flags);
>>> +	assert_spin_locked(&timeline->lock);
>>>
>>> 	seqno = timeline_get_seqno(timeline->common);
>>> 	GEM_BUG_ON(!seqno);
>>> @@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>> 	GEM_BUG_ON(!request->global_seqno);
>>> 	engine->emit_breadcrumb(request,
>>> 				request->ring->vaddr + request->postfix);
>>> -	engine->submit_request(request);
>>>
>>> 	spin_lock(&request->timeline->lock);
>>> 	list_move_tail(&request->link, &timeline->requests);
>>> 	spin_unlock(&request->timeline->lock);
>>>
>>> 	i915_sw_fence_commit(&request->execute);
>>> +}
>>>
>>> -	spin_unlock_irqrestore(&timeline->lock, flags);
>>> +static int __i915_sw_fence_call
>>> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>>> +{
>>> +	if (state == FENCE_COMPLETE) {
>>> +		struct drm_i915_gem_request *request =
>>> +			container_of(fence, typeof(*request), submit);
>>> +		struct intel_engine_cs *engine = request->engine;
>>> +		unsigned long flags;
>>> +
>>> +		/* Will be called from irq-context when using foreign fences. */
>>> +		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
>>> +					 SINGLE_DEPTH_NESTING);
>>> +		engine->submit_request(request);
>>> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>> Would it be cleaner to move the lock taking to engine->submit_request?
>
> Perhaps. Certainly pushes the ugliness down a layer!
>
>> And is _nested still required? I thought you said it is not. I can't
>> find signalling under the timeline lock either.
>
> It is still required to sort out the ordering between external
> >lockclasses (vgem/nouveau/etc).

Hm, how? I don't see it. :(

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 10:55       ` Chris Wilson
@ 2016-11-10 11:54         ` Tvrtko Ursulin
  2016-11-10 12:10           ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 11:54 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, tvrtko.ursulin


On 10/11/2016 10:55, Chris Wilson wrote:
> On Thu, Nov 10, 2016 at 10:44:44AM +0000, Tvrtko Ursulin wrote:
>>
>> On 08/11/2016 12:20, Chris Wilson wrote:
>>> On Mon, Nov 07, 2016 at 01:59:45PM +0000, Chris Wilson wrote:
>>>> The scheduler needs to know the dependencies of each request for the
>>>> lifetime of the request, as it may choose to reschedule the requests at
>>>> any time and must ensure the dependency tree is not broken. This is in
> >>>> addition to using the fence to only allow execution after all
>>>> dependencies have been completed.
>>>>
>>>> One option was to extend the fence to support the bidirectional
>>>> dependency tracking required by the scheduler. However the mismatch in
>>>> lifetimes between the submit fence and the request essentially meant
>>>> that we had to build a completely separate struct (and we could not
>>>> simply reuse the existing waitqueue in the fence for one half of the
>>>> dependency tracking). The extra dependency tracking simply did not mesh
>>>> well with the fence, and keeping it separate both keeps the fence
>>>> implementation simpler and allows us to extend the dependency tracking
>>>> into a priority tree (whilst maintaining support for reordering the
>>>> tree).
>>>>
>>>> To avoid the additional allocations and list manipulations, the use of
>>>> the priotree is disabled when there are no schedulers to use it.
>>>>
>>>> v2: Create a dedicated slab for i915_dependency.
>>>>    Rename the lists.
>>>
>>> Sod. I've squashed the priority sort into this as a rebase calamity.
>>
>> Waiting for the other patches to get looked at before
>> untangle/respin or just forgot? :)
>
> Just imagine the split ;) Think of the intel_lrc.c as a separate patch.

It is a little bit more than that, but more importantly, is your plan 
now to just keep it squashed? I can pre-review it squashed, but I think 
it would be very desirable to eventually split it up again.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2)
  2016-11-10 11:45 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2) Patchwork
@ 2016-11-10 12:04   ` Saarinen, Jani
  0 siblings, 0 replies; 82+ messages in thread
From: Saarinen, Jani @ 2016-11-10 12:04 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson

Hi, 
> == Series Details ==
> 
> Series: series starting with [v2,01/11] drm/i915: Create distinct lockclasses for
> execution vs user timelines (rev2)
> URL   : https://patchwork.freedesktop.org/series/14926/
> State : success
> 
> == Summary ==
> 
> Series 14926v2 Series without cover letter
> https://patchwork.freedesktop.org/api/1.0/series/14926/revisions/2/mbox/
> 
> 
> fi-byt-j1900     total:244  pass:216  dwarn:0   dfail:0   fail:0   skip:28
> fi-byt-n2820     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32
> fi-hsw-4770      total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20
> fi-hsw-4770r     total:244  pass:224  dwarn:0   dfail:0   fail:0   skip:20
> fi-ilk-650       total:244  pass:191  dwarn:0   dfail:0   fail:0   skip:53
> fi-ivb-3520m     total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22
> fi-ivb-3770      total:244  pass:222  dwarn:0   dfail:0   fail:0   skip:22
> fi-snb-2520m     total:244  pass:212  dwarn:0   dfail:0   fail:0   skip:32
> fi-snb-2600      total:244  pass:211  dwarn:0   dfail:0   fail:0   skip:33
> fi-kbl-7200u failed to connect after reboot
Please note that none of the Gen8+ machines booted with this patched kernel.

> 
> eb88955cdc6a1f4dabff6bc27747c1c9e9a3aaef drm-intel-nightly: 2016y-11m-
> 10d-09h-29m-41s UTC integration manifest b0e329d drm/i915: Support explicit
> fencing for execbuf 6c4e06f drm/i915: Enable userspace to opt-out of implicit
> fencing
> bfe3e23 drm/i915/scheduler: Support user-defined priorities
> 465ba68 HACK drm/i915/scheduler: emulate a scheduler for guc f39120b
> drm/i915/scheduler: Boost priorities for flips
> 27e53d9 drm/i915/scheduler: Record all dependencies upon request
> construction dda333d drm/i915/scheduler: Signal the arrival of a new request
> f1edd87 drm/i915: Remove engine->execlist_lock aad42bc drm/i915: Defer
> transfer onto execution timeline to actual hw submission
> c646d89 drm/i915: Split request submit/execute phase into two 67c609a
> drm/i915: Create distinct lockclasses for execution vs user timelines
> 
> == Logs ==
> 
> For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_2954/


Jani Saarinen
Intel Finland Oy - BIC 0357606-4 - Westendinkatu 7, 02160 Espoo


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 11:54         ` Tvrtko Ursulin
@ 2016-11-10 12:10           ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 12:10 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 11:54:51AM +0000, Tvrtko Ursulin wrote:
> 
> On 10/11/2016 10:55, Chris Wilson wrote:
> >On Thu, Nov 10, 2016 at 10:44:44AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 08/11/2016 12:20, Chris Wilson wrote:
> >>>On Mon, Nov 07, 2016 at 01:59:45PM +0000, Chris Wilson wrote:
> >>>>The scheduler needs to know the dependencies of each request for the
> >>>>lifetime of the request, as it may choose to reschedule the requests at
> >>>>any time and must ensure the dependency tree is not broken. This is in
> >>>>addition to using the fence to only allow execution after all
> >>>>dependencies have been completed.
> >>>>
> >>>>One option was to extend the fence to support the bidirectional
> >>>>dependency tracking required by the scheduler. However the mismatch in
> >>>>lifetimes between the submit fence and the request essentially meant
> >>>>that we had to build a completely separate struct (and we could not
> >>>>simply reuse the existing waitqueue in the fence for one half of the
> >>>>dependency tracking). The extra dependency tracking simply did not mesh
> >>>>well with the fence, and keeping it separate both keeps the fence
> >>>>implementation simpler and allows us to extend the dependency tracking
> >>>>into a priority tree (whilst maintaining support for reordering the
> >>>>tree).
> >>>>
> >>>>To avoid the additional allocations and list manipulations, the use of
> >>>>the priotree is disabled when there are no schedulers to use it.
> >>>>
> >>>>v2: Create a dedicated slab for i915_dependency.
> >>>>   Rename the lists.
> >>>
> >>>Sod. I've squashed the priority sort into this as a rebase calamity.
> >>
> >>Waiting for the other patches to get looked at before
> >>untangle/respin or just forgot? :)
> >
> >Just imagine the split ;) Think of the intel_lrc.c as a separate patch.
> 
> It is a little bit more than that, but more importantly, is your
> plan now to just keep it squashed? I can pre-review it squashed, but
> I think it would be very desirable to eventually split it up again.

I've unsplit it, but I didn't resend the series again because I didn't
think it was that important; resending just this pair would mess up CI.
The changed code is just the DFS which is where I would appreciate
comments.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread
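
(The iterative DFS referred to here is not quoted anywhere in this thread; in rough outline it flattens the recursive priority-inheritance walk onto a single list via the dfs_link hook in struct i915_dependency. A sketch of the idea only, with priotree_bump() as an illustrative name — not the patch as posted:

	static void priotree_bump(struct drm_i915_gem_request *request, int prio)
	{
		struct i915_dependency stack;
		struct i915_dependency *dep, *p;
		LIST_HEAD(dfs);

		if (prio <= READ_ONCE(request->priotree.priority))
			return;

		/* Seed the walk with the request whose priority was raised. */
		stack.signaler = &request->priotree;
		list_add(&stack.dfs_link, &dfs);

		/* The recursion is flattened onto one list: every node we
		 * visit appends its not-yet-boosted signalers to the tail,
		 * so the iteration keeps going until no new dependencies
		 * require a priority bump.
		 */
		list_for_each_entry(dep, &dfs, dfs_link) {
			struct i915_priotree *pt = dep->signaler;

			list_for_each_entry(p, &pt->signalers_list, signal_link)
				if (prio > READ_ONCE(p->signaler->priority))
					list_move_tail(&p->dfs_link, &dfs);
		}

		/* Then, under the engine timeline lock, apply the new
		 * priority to each collected node and reinsert it into the
		 * execlist rbtree so it is dequeued ahead of lower-priority
		 * work.
		 */
	}
)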

* Re: [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities
  2016-11-07 13:59 ` [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2016-11-10 13:02   ` Tvrtko Ursulin
  2016-11-10 13:10     ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 13:02 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 13:59, Chris Wilson wrote:
> Use a priority stored in the context as the initial value when
> submitting a request. This allows us to change the default priority on a
> per-context basis, allowing different contexts to be favoured with GPU
> time at the expense of lower importance work. The user can adjust the
> context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
> values being higher priority (they will be serviced earlier, after their
> dependencies have been resolved). Any prerequisite work for an execbuf
> will have its priority raised to match the new request as required.
>
> Normal users can specify any value in the range of -1023 to 0 [default],
> i.e. they can reduce the priority of their workloads (and temporarily
> boost it back to normal if so desired).
>
> Privileged users can specify any value in the range of -1023 to 1023,
> [default is 0], i.e. they can raise their priority above all others and
> so potentially starve the system.
>
> Note that the existing schedulers are not fair, nor load balancing, the
> execution is strictly by priority on a first-come, first-served basis,
> and the driver may choose to boost some requests above the range
> available to users.
>
> This priority was originally based around nice(2), but evolved to allow
> clients to adjust their priority within a small range, and allow for a
> privileged high priority range.
>
> For example, this can be used to implement EGL_IMG_context_priority
> https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt
>
> 	EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
>         the context to be created. This attribute is a hint, as an
>         implementation may not support multiple contexts at some
>         priority levels and system policy may limit access to high
>         priority contexts to appropriate system privilege level. The
>         default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
>         EGL_CONTEXT_PRIORITY_MEDIUM_IMG."
>
> so we can map
>
> 	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
> 	PRIORITY_MED -> 0 [default]
> 	PRIORITY_LOW -> -1023
>
> They also map onto the priorities used by VkQueue (and a VkQueue is
> essentially a timeline, our i915_gem_context under full-ppgtt).
>
> Testcase: igt/gem_exec_schedule
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 21 +++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
>  include/uapi/drm/i915_drm.h             |  3 +++
>  4 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 5c658c6d06e4..d253aeee0fb2 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -949,6 +949,7 @@ struct i915_gem_context {
>  	/* Unique identifier for this context, used by the hw for tracking */
>  	unsigned int hw_id;
>  	u32 user_handle;
> +	int priority; /* greater priorities are serviced first */
>
>  	u32 ggtt_alignment;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 6dd475735f0a..48b5aacf5fc2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
>  		return PTR_ERR(ctx);
>  	}
>
> +	ctx->priority = -I915_PRIORITY_MAX; /* lowest priority; idle task */

Does this matter at all? What are we submitting from the kernel context? 
If we are submitting some things, is it even correct?

Another thing: -I915_PRIORITY_MAX looks strange; why not have
I915_PRIORITY_MIN?

>  	dev_priv->kernel_context = ctx;
>
>  	DRM_DEBUG_DRIVER("%s context support initialized\n",
> @@ -1100,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
>  		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
>  		break;
> +	case I915_CONTEXT_PARAM_PRIORITY:
> +		args->value = ctx->priority;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1155,6 +1159,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
>  		}
>  		break;
> +
> +	case I915_CONTEXT_PARAM_PRIORITY:
> +		{
> +			int priority = args->value;
> +
> +			if (args->size)
> +				ret = -EINVAL;
> +			else if (priority >= I915_PRIORITY_MAX ||
> +				 priority <= -I915_PRIORITY_MAX)
> +				ret = -EINVAL;
> +			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
> +				ret = -EPERM;
> +			else
> +				ctx->priority = priority;
> +		}
> +		break;
> +
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 278b103a4e95..cfda095f0234 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -861,7 +861,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>  	 * run at the earliest possible convenience.
>  	 */
>  	if (engine->schedule)
> -		engine->schedule(request, 0);
> +		engine->schedule(request, request->ctx->priority);
>
>  	local_bh_disable();
>  	i915_sw_fence_commit(&request->submit);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 1c12a350eca3..47901a8ad682 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -391,6 +391,8 @@ typedef struct drm_i915_irq_wait {
>
>  /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
>   * priorities and the driver will attempt to execute batches in priority order.
> + * The initial priority for each batch is supplied by the context and is
> + * controlled via I915_CONTEXT_PARAM_PRIORITY.
>   */
>  #define I915_PARAM_HAS_SCHEDULER	 41
>
> @@ -1224,6 +1226,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
>  #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
>  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
> +#define I915_CONTEXT_PARAM_PRIORITY	0x5
>  	__u64 value;
>  };
>
>

Otherwise looks OK to me.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread
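
(For completeness, from the userspace side the new parameter would be exercised roughly like this — a sketch assuming the uapi lands as posted above, with set_context_priority() as an illustrative helper:

	#include <xf86drm.h>
	#include <drm/i915_drm.h>

	static int set_context_priority(int fd, __u32 ctx_id, int prio)
	{
		struct drm_i915_gem_context_param p = {
			.ctx_id = ctx_id,
			.param = I915_CONTEXT_PARAM_PRIORITY, /* 0x5 in this series */
			.value = prio, /* -1023..0 unprivileged, up to 1023 with CAP_SYS_ADMIN */
		};

		/* .size is left zero: the setparam path rejects args->size != 0 */
		return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
	}
)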

* Re: [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities
  2016-11-10 13:02   ` Tvrtko Ursulin
@ 2016-11-10 13:10     ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 13:10 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 01:02:08PM +0000, Tvrtko Ursulin wrote:
> 
> On 07/11/2016 13:59, Chris Wilson wrote:
> >Use a priority stored in the context as the initial value when
> >submitting a request. This allows us to change the default priority on a
> >per-context basis, allowing different contexts to be favoured with GPU
> >time at the expense of lower importance work. The user can adjust the
> >context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
> >values being higher priority (they will be serviced earlier, after their
> >dependencies have been resolved). Any prerequisite work for an execbuf
> >will have its priority raised to match the new request as required.
> >
> >Normal users can specify any value in the range of -1023 to 0 [default],
> >i.e. they can reduce the priority of their workloads (and temporarily
> >boost it back to normal if so desired).
> >
> >Privileged users can specify any value in the range of -1023 to 1023,
> >[default is 0], i.e. they can raise their priority above all others and
> >so potentially starve the system.
> >
> >Note that the existing schedulers are not fair, nor load balancing, the
> >execution is strictly by priority on a first-come, first-served basis,
> >and the driver may choose to boost some requests above the range
> >available to users.
> >
> >This priority was originally based around nice(2), but evolved to allow
> >clients to adjust their priority within a small range, and allow for a
> >privileged high priority range.
> >
> >For example, this can be used to implement EGL_IMG_context_priority
> >https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt
> >
> >	EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
> >        the context to be created. This attribute is a hint, as an
> >        implementation may not support multiple contexts at some
> >        priority levels and system policy may limit access to high
> >        priority contexts to appropriate system privilege level. The
> >        default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
> >        EGL_CONTEXT_PRIORITY_MEDIUM_IMG."
> >
> >so we can map
> >
> >	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
> >	PRIORITY_MED -> 0 [default]
> >	PRIORITY_LOW -> -1023
> >
> >They also map onto the priorities used by VkQueue (and a VkQueue is
> >essentially a timeline, our i915_gem_context under full-ppgtt).
> >
> >Testcase: igt/gem_exec_schedule
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_drv.h         |  1 +
> > drivers/gpu/drm/i915/i915_gem_context.c | 21 +++++++++++++++++++++
> > drivers/gpu/drm/i915/i915_gem_request.c |  2 +-
> > include/uapi/drm/i915_drm.h             |  3 +++
> > 4 files changed, 26 insertions(+), 1 deletion(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 5c658c6d06e4..d253aeee0fb2 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -949,6 +949,7 @@ struct i915_gem_context {
> > 	/* Unique identifier for this context, used by the hw for tracking */
> > 	unsigned int hw_id;
> > 	u32 user_handle;
> >+	int priority; /* greater priorities are serviced first */
> >
> > 	u32 ggtt_alignment;
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> >index 6dd475735f0a..48b5aacf5fc2 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_context.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_context.c
> >@@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
> > 		return PTR_ERR(ctx);
> > 	}
> >
> >+	ctx->priority = -I915_PRIORITY_MAX; /* lowest priority; idle task */
> 
> Does this matter at all? What are we submitting from the kernel
> context? If we are submitting some things, is it even correct?

We idle from the kernel context, and we issue rc6 setup in case userspace
hasn't. Yes, everything we do today is of lowest priority, and
arbitrarily boosting a request is not a problem for the kernel (but we
don't support deboosting).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread
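
(On the -I915_PRIORITY_MAX readability point raised above, the symmetric define would be a one-liner next to the existing maximum — a sketch of the suggestion, not part of the series as posted:

	#define I915_PRIORITY_MAX 1024
	#define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)

		/* kernel context: lowest priority, never competes with
		 * userspace work
		 */
		ctx->priority = I915_PRIORITY_MIN;
)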

* Re: [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-10 11:51       ` Tvrtko Ursulin
@ 2016-11-10 14:43         ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 14:43 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 11:51:27AM +0000, Tvrtko Ursulin wrote:
> 
> On 10/11/2016 11:11, Chris Wilson wrote:
> >On Thu, Nov 10, 2016 at 10:43:29AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 07/11/2016 13:59, Chris Wilson wrote:
> >>>Defer the transfer from the client's timeline onto the execution
> >>>timeline from the point of readiness to the point of actual submission.
> >>>For example, in execlists, a request is finally submitted to hardware
> >>>when the hardware is ready, and only put onto the hardware queue when
> >>>the request is ready. By deferring the transfer, we ensure that the
> >>>timeline is maintained in retirement order if we decide to queue the
> >>>requests onto the hardware in a different order than fifo.
> >>>
> >>>v2: Rebased onto distinct global/user timeline lock classes.
> >>>
> >>>Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>---
> >>>drivers/gpu/drm/i915/i915_gem_request.c    | 31 +++++++++++++++++-------------
> >>>drivers/gpu/drm/i915/i915_gem_request.h    |  2 ++
> >>>drivers/gpu/drm/i915/i915_guc_submission.c | 14 +++++++++++++-
> >>>drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++++---------
> >>>drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
> >>>5 files changed, 49 insertions(+), 23 deletions(-)
> >>>
> >>>diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >>>index e41d51a68ed8..19c29fafb07a 100644
> >>>--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >>>+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >>>@@ -307,25 +307,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
> >>>	return atomic_inc_return(&tl->next_seqno);
> >>>}
> >>>
> >>>-static int __i915_sw_fence_call
> >>>-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >>>+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
> >>>{
> >>>-	struct drm_i915_gem_request *request =
> >>>-		container_of(fence, typeof(*request), submit);
> >>>	struct intel_engine_cs *engine = request->engine;
> >>>	struct intel_timeline *timeline;
> >>>-	unsigned long flags;
> >>>	u32 seqno;
> >>>
> >>>-	if (state != FENCE_COMPLETE)
> >>>-		return NOTIFY_DONE;
> >>>-
> >>>	/* Transfer from per-context onto the global per-engine timeline */
> >>>	timeline = engine->timeline;
> >>>	GEM_BUG_ON(timeline == request->timeline);
> >>>-
> >>>-	/* Will be called from irq-context when using foreign DMA fences */
> >>>-	spin_lock_irqsave(&timeline->lock, flags);
> >>>+	assert_spin_locked(&timeline->lock);
> >>>
> >>>	seqno = timeline_get_seqno(timeline->common);
> >>>	GEM_BUG_ON(!seqno);
> >>>@@ -345,15 +336,29 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >>>	GEM_BUG_ON(!request->global_seqno);
> >>>	engine->emit_breadcrumb(request,
> >>>				request->ring->vaddr + request->postfix);
> >>>-	engine->submit_request(request);
> >>>
> >>>	spin_lock(&request->timeline->lock);
> >>>	list_move_tail(&request->link, &timeline->requests);
> >>>	spin_unlock(&request->timeline->lock);
> >>>
> >>>	i915_sw_fence_commit(&request->execute);
> >>>+}
> >>>
> >>>-	spin_unlock_irqrestore(&timeline->lock, flags);
> >>>+static int __i915_sw_fence_call
> >>>+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> >>>+{
> >>>+	if (state == FENCE_COMPLETE) {
> >>>+		struct drm_i915_gem_request *request =
> >>>+			container_of(fence, typeof(*request), submit);
> >>>+		struct intel_engine_cs *engine = request->engine;
> >>>+		unsigned long flags;
> >>>+
> >>>+		/* Will be called from irq-context when using foreign fences. */
> >>>+		spin_lock_irqsave_nested(&engine->timeline->lock, flags,
> >>>+					 SINGLE_DEPTH_NESTING);
> >>>+		engine->submit_request(request);
> >>>+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> >
> >>Would it be cleaner to move the lock taking to engine->submit_request?
> >
> >Perhaps. Certainly pushes the ugliness down a layer!
> >
> >>And is _nested still required? I thought you said it is not. I can't
> >>find signalling under the timeline lock either.
> >
> >It is still required to sort out the ordering between external
> >lockclasses (vgem/nouveau/etc)
> 
> Hm, how? I don't see it. :(

vgem was triggering a different path but it was the combination of
swfences causing the lockdep splat.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-07 13:59 ` [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
  2016-11-08 12:20   ` Chris Wilson
@ 2016-11-10 14:45   ` Tvrtko Ursulin
  2016-11-10 15:01     ` Chris Wilson
  1 sibling, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 14:45 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 07/11/2016 13:59, Chris Wilson wrote:
> The scheduler needs to know the dependencies of each request for the
> lifetime of the request, as it may choose to reschedule the requests at
> any time and must ensure the dependency tree is not broken. This is in
> addition to using the fence to only allow execution after all
> dependencies have been completed.
>
> One option was to extend the fence to support the bidirectional
> dependency tracking required by the scheduler. However the mismatch in
> lifetimes between the submit fence and the request essentially meant
> that we had to build a completely separate struct (and we could not
> simply reuse the existing waitqueue in the fence for one half of the
> dependency tracking). The extra dependency tracking simply did not mesh
> well with the fence, and keeping it separate both keeps the fence
> implementation simpler and allows us to extend the dependency tracking
> into a priority tree (whilst maintaining support for reordering the
> tree).
>
> To avoid the additional allocations and list manipulations, the use of
> the priotree is disabled when there are no schedulers to use it.
>
> v2: Create a dedicated slab for i915_dependency.
>     Rename the lists.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
>  drivers/gpu/drm/i915/i915_drv.h            |   1 +
>  drivers/gpu/drm/i915/i915_gem.c            |  14 ++-
>  drivers/gpu/drm/i915/i915_gem_request.c    |  96 +++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_request.h    |  40 ++++++++-
>  drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
>  drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
>  drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
>  9 files changed, 282 insertions(+), 18 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 3cb96d260dfb..dac435680e98 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
>  			  struct drm_i915_gem_request *rq,
>  			  const char *prefix)
>  {
> -	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
> +	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
>  		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
> +		   rq->priotree.priority,
>  		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
>  		   rq->timeline->common->name);
>  }
> @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>
>  		if (i915.enable_execlists) {
>  			u32 ptr, read, write;
> +			struct rb_node *rb;
>
>  			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
>  				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>  			rcu_read_unlock();
>
>  			spin_lock_irq(&engine->timeline->lock);
> -			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
> +			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> +				rq = rb_entry(rb, typeof(*rq), priotree.node);
>  				print_request(m, rq, "\t\tQ ");
>  			}
>  			spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 4735b4177100..e790147209f3 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1791,6 +1791,7 @@ struct drm_i915_private {
>  	struct kmem_cache *objects;
>  	struct kmem_cache *vmas;
>  	struct kmem_cache *requests;
> +	struct kmem_cache *dependencies;
>
>  	const struct intel_device_info info;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index df803e82eb07..a4dc2da2323a 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2693,10 +2693,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>
>  		spin_lock_irqsave(&engine->timeline->lock, flags);
>
> -		INIT_LIST_HEAD(&engine->execlist_queue);
>  		i915_gem_request_put(engine->execlist_port[0].request);
>  		i915_gem_request_put(engine->execlist_port[1].request);
>  		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> +		engine->execlist_queue = RB_ROOT;
> +		engine->execlist_first = NULL;
>
>  		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  	}
> @@ -4754,12 +4755,18 @@ i915_gem_load_init(struct drm_device *dev)
>  	if (!dev_priv->requests)
>  		goto err_vmas;
>
> +	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
> +					    SLAB_HWCACHE_ALIGN |
> +					    SLAB_RECLAIM_ACCOUNT);
> +	if (!dev_priv->dependencies)
> +		goto err_requests;
> +
>  	mutex_lock(&dev_priv->drm.struct_mutex);
>  	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>  	err = i915_gem_timeline_init__global(dev_priv);
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  	if (err)
> -		goto err_requests;
> +		goto err_dependencies;
>
>  	INIT_LIST_HEAD(&dev_priv->context_list);
>  	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
> @@ -4787,6 +4794,8 @@ i915_gem_load_init(struct drm_device *dev)
>
>  	return 0;
>
> +err_dependencies:
> +	kmem_cache_destroy(dev_priv->dependencies);
>  err_requests:
>  	kmem_cache_destroy(dev_priv->requests);
>  err_vmas:
> @@ -4803,6 +4812,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
>
>  	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
>
> +	kmem_cache_destroy(dev_priv->dependencies);
>  	kmem_cache_destroy(dev_priv->requests);
>  	kmem_cache_destroy(dev_priv->vmas);
>  	kmem_cache_destroy(dev_priv->objects);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 93f77df9bc51..278b103a4e95 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -113,6 +113,82 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>
> +static struct i915_dependency *
> +i915_dependency_alloc(struct drm_i915_private *i915)
> +{
> +	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
> +}
> +
> +static void
> +i915_dependency_free(struct drm_i915_private *i915,
> +		     struct i915_dependency *dep)
> +{
> +	kmem_cache_free(i915->dependencies, dep);
> +}
> +
> +static void
> +__i915_priotree_add_dependency(struct i915_priotree *pt,
> +			       struct i915_priotree *signal,
> +			       struct i915_dependency *dep,
> +			       unsigned long flags)
> +{
> +	INIT_LIST_HEAD(&dep->dfs_link);
> +	list_add(&dep->wait_link, &signal->waiters_list);
> +	list_add(&dep->signal_link, &pt->signalers_list);
> +	dep->signaler = signal;
> +	dep->flags = flags;
> +}
> +
> +static int
> +i915_priotree_add_dependency(struct drm_i915_private *i915,
> +			     struct i915_priotree *pt,
> +			     struct i915_priotree *signal)
> +{
> +	struct i915_dependency *dep;
> +
> +	dep = i915_dependency_alloc(i915);
> +	if (!dep)
> +		return -ENOMEM;
> +
> +	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
> +	return 0;
> +}
> +
> +static void
> +i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
> +{
> +	struct i915_dependency *dep, *next;
> +
> +	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> +
> +	/* Everyone we depended upon (the fences we wait to be signaled)
> +	 * should retire before us and remove themselves from our list.
> +	 * However, retirement is run independently on each timeline and
> +	 * so we may be called out-of-order.
> +	 */
> +	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
> +		list_del(&dep->wait_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			i915_dependency_free(i915, dep);
> +	}
> +
> +	/* Remove ourselves from everyone who depends upon us */
> +	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
> +		list_del(&dep->signal_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			i915_dependency_free(i915, dep);
> +	}
> +}
> +
> +static void
> +i915_priotree_init(struct i915_priotree *pt)
> +{
> +	INIT_LIST_HEAD(&pt->signalers_list);
> +	INIT_LIST_HEAD(&pt->waiters_list);
> +	RB_CLEAR_NODE(&pt->node);
> +	pt->priority = INT_MIN;

Not I915_PRIORITY_MIN? Or does it have to be smaller? In which case 
BUILD_BUG_ON(INT_MIN >= I915_PRIORITY_MIN)?

> +}
> +
>  void i915_gem_retire_noop(struct i915_gem_active *active,
>  			  struct drm_i915_gem_request *request)
>  {
> @@ -182,6 +258,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  	i915_gem_context_put(request->ctx);
>
>  	dma_fence_signal(&request->fence);
> +
> +	i915_priotree_fini(request->i915, &request->priotree);
>  	i915_gem_request_put(request);
>  }
>
> @@ -461,6 +539,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>  	 */
>  	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
>
> +	i915_priotree_init(&req->priotree);
> +
>  	INIT_LIST_HEAD(&req->active_list);
>  	req->i915 = dev_priv;
>  	req->engine = engine;
> @@ -514,6 +594,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
>
>  	GEM_BUG_ON(to == from);
>
> +	if (to->engine->schedule) {
> +		ret = i915_priotree_add_dependency(to->i915,
> +						   &to->priotree,
> +						   &from->priotree);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	if (to->timeline == from->timeline)
>  		return 0;
>
> @@ -737,9 +825,15 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>
>  	prev = i915_gem_active_raw(&timeline->last_request,
>  				   &request->i915->drm.struct_mutex);
> -	if (prev)
> +	if (prev) {
>  		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
>  					     &request->submitq);
> +		if (engine->schedule)
> +			__i915_priotree_add_dependency(&request->priotree,
> +						       &prev->priotree,
> +						       &request->dep,
> +						       0);
> +	}
>
>  	spin_lock_irq(&timeline->lock);
>  	list_add_tail(&request->link, &timeline->requests);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index d8904863d3d9..584d76170df3 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -41,6 +41,32 @@ struct intel_signal_node {
>  	struct intel_wait wait;
>  };
>
> +struct i915_dependency {
> +	struct i915_priotree *signaler;
> +	struct list_head signal_link;
> +	struct list_head wait_link;
> +	struct list_head dfs_link;
> +	unsigned long flags;
> +#define I915_DEPENDENCY_ALLOC BIT(0)
> +};
> +
> +/* Requests exist in a complex web of interdependencies. Each request
> + * has to wait for some other request to complete before it is ready to be run
> + * (e.g. we have to wait until the pixels have been rendered into a texture
> + * before we can copy from it). We track the readiness of a request in terms
> + * of fences, but we also need to keep the dependency tree for the lifetime
> + * of the request (beyond the life of an individual fence). We use the tree
> + * at various points to reorder the requests whilst keeping the requests
> + * in order with respect to their various dependencies.
> + */
> +struct i915_priotree {
> +	struct list_head signalers_list; /* those before us, we depend upon */
> +	struct list_head waiters_list; /* those after us, they depend upon us */
> +	struct rb_node node;
> +	int priority;
> +#define I915_PRIORITY_MAX 1024
> +};
> +
>  /**
>   * Request queue structure.
>   *
> @@ -102,6 +128,17 @@ struct drm_i915_gem_request {
>  	wait_queue_t submitq;
>  	wait_queue_t execq;
>
> +	/* A list of everyone we wait upon, and everyone who waits upon us.
> +	 * Even though we will not be submitted to the hardware before the
> +	 * submit fence is signaled (it waits for all external events as well
> +	 * as our own requests), the scheduler still needs to know the
> +	 * dependency tree for the lifetime of the request (from execbuf
> +	 * to retirement), i.e. bidirectional dependency information for the
> +	 * request not tied to individual fences.
> +	 */
> +	struct i915_priotree priotree;
> +	struct i915_dependency dep;
> +
>  	u32 global_seqno;
>
>  	/** GEM sequence number associated with the previous request,
> @@ -158,9 +195,6 @@ struct drm_i915_gem_request {
>  	struct drm_i915_file_private *file_priv;
>  	/** file_priv list entry for this request */
>  	struct list_head client_list;
> -
> -	/** Link in the execlist submission queue, guarded by execlist_lock. */
> -	struct list_head execlist_link;
>  };
>
>  extern const struct dma_fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 83438c6a8864..7c6819968307 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -1532,6 +1532,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = i915_guc_submit;
> +		engine->schedule = NULL;
>
>  		/* Replay the current set of previously submitted requests */
>  		list_for_each_entry(request,
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c9171a058478..3da4d466e332 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
>   */
>  void intel_engine_setup_common(struct intel_engine_cs *engine)
>  {
> -	INIT_LIST_HEAD(&engine->execlist_queue);
> +	engine->execlist_queue = RB_ROOT;
> +	engine->execlist_first = NULL;
>
>  	intel_engine_init_timeline(engine);
>  	intel_engine_init_hangcheck(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 94933f4297bb..af944a246511 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
> -	struct drm_i915_gem_request *cursor, *last;
> +	struct drm_i915_gem_request *last;
>  	struct execlist_port *port = engine->execlist_port;
>  	unsigned long flags;
> +	struct rb_node *rb;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 */
>
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
> -	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
>  		/* Can we combine this request with the current port? It has to
>  		 * be the same context/ringbuffer and not have any exceptions
>  		 * (e.g. GVT saying never to combine contexts).
> @@ -503,6 +508,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			port++;
>  		}
>
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;

What does setting the priority to INT_MAX here do?

> +
>  		/* We keep the previous context alive until we retire the
>  		 * following request. This ensures that any the context object
>  		 * is still pinned for any residual writes the HW makes into it
> @@ -517,11 +527,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		submit = true;
>  	}
>  	if (submit) {
> -		/* Decouple all the requests submitted from the queue */
> -		engine->execlist_queue.next = &cursor->execlist_link;
> -		cursor->execlist_link.prev = &engine->execlist_queue;
> -
>  		i915_gem_request_assign(&port->request, last);
> +		engine->execlist_first = rb;
>  	}
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
> @@ -604,17 +611,126 @@ static void intel_lrc_irq_handler(unsigned long data)
>  	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>  }
>
> +static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> +{
> +	struct rb_node **p, *rb;
> +	bool first = true;
> +
> +	/* most positive priority is scheduled first, equal priorities fifo */
> +	rb = NULL;
> +	p = &root->rb_node;
> +	while (*p) {
> +		struct i915_priotree *pos;
> +
> +		rb = *p;
> +		pos = rb_entry(rb, typeof(*pos), node);
> +		if (pt->priority > pos->priority) {
> +			p = &rb->rb_left;
> +		} else {
> +			p = &rb->rb_right;
> +			first = false;
> +		}
> +	}
> +	rb_link_node(&pt->node, rb, p);
> +	rb_insert_color(&pt->node, root);
> +
> +	return first;
> +}
> +
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
>
>  	assert_spin_locked(&engine->timeline->lock);
>
> -	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> +	if (insert_request(&request->priotree, &engine->execlist_queue))
> +		engine->execlist_first = &request->priotree.node;
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
>  }
>
> +static struct intel_engine_cs *
> +pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	engine = container_of(pt,
> +			      struct drm_i915_gem_request,
> +			      priotree)->engine;
> +	if (engine != locked) {
> +		if (locked)
> +			spin_unlock_irq(&locked->timeline->lock);
> +		spin_lock_irq(&engine->timeline->lock);
> +	}
> +
> +	return engine;
> +}

Ha, cute. :)
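
The cuteness is hand-over-hand locking: at most one engine timeline
lock is held at any point in the walk. A minimal sketch of the calling
pattern, using the names from the patch (the iteration itself is
simplified here):

	struct intel_engine_cs *engine = NULL;

	list_for_each_entry(dep, &dfs, dfs_link) {
		struct i915_priotree *pt = dep->signaler;

		/* Drops the previously held engine->timeline->lock (if
		 * any) and takes the lock for pt's engine instead.
		 */
		engine = pt_lock_engine(pt, engine);

		/* ... modify pt under engine->timeline->lock ... */
	}

	/* Release whichever lock the walk ended holding. */
	if (engine)
		spin_unlock_irq(&engine->timeline->lock);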

> +
> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> +{
> +	struct intel_engine_cs *engine = NULL;
> +	struct i915_dependency *dep, *p;
> +	struct i915_dependency stack;
> +	LIST_HEAD(dfs);
> +
> +	if (prio <= READ_ONCE(request->priotree.priority))
> +		return;
> +
> +	/* Need BKL in order to use the temporary link inside i915_dependency */
> +	lockdep_assert_held(&request->i915->drm.struct_mutex);

Could use a new i915 submission lock, but I suppose this is OK to start 
with.

> +
> +	stack.signaler = &request->priotree;
> +	list_add(&stack.dfs_link, &dfs);
> +

Tada, onto the DFS which I am not familiar with - but there's always 
Wikipedia. :)

> +	/* Recursively bump all dependent priorities to match the new request */
> +	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
> +		struct i915_priotree *pt = dep->signaler;
> +
> +		list_for_each_entry(p, &pt->signalers_list, signal_link)
> +			if (prio > READ_ONCE(p->signaler->priority))
> +				list_move_tail(&p->dfs_link, &dfs);
> +
> +		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);

This was fun. :)

> +		if (!RB_EMPTY_NODE(&pt->node))
> +			continue;
> +
> +		engine = pt_lock_engine(pt, engine);
> +
> +		/* If it is not already in the rbtree, we can update the
> +		 * priority inplace and skip over it (and its dependencies)
> +		 * if it is referenced again as we descend the dfs.
> +		 */

Are you sure it is OK to ignore the dependencies for new requests? I 
don't see why it would be.

> +		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {

Isn't the node guaranteed to be empty from the earlier test and continue?

> +			pt->priority = prio;
> +			list_del_init(&dep->dfs_link);
> +		}
> +	}
> +
> +	/* Fifo and depth-first replacement ensure our deps execute before us */
> +	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
> +		struct i915_priotree *pt = dep->signaler;
> +
> +		INIT_LIST_HEAD(&dep->dfs_link);
> +
> +		engine = pt_lock_engine(pt, engine);
> +
> +		if (prio <= pt->priority)
> +			continue;

How would these priorities end up in the list? The first loop skips over 
them.

> +
> +		GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));
> +
> +		pt->priority = prio;
> +		rb_erase(&pt->node, &engine->execlist_queue);
> +		if (insert_request(pt, &engine->execlist_queue))
> +			engine->execlist_first = &pt->node;
> +	}
> +
> +	if (engine)
> +		spin_unlock_irq(&engine->timeline->lock);
> +
> +	/* XXX Do we need to preempt to make room for us and our deps? */
> +}
> +
>  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> @@ -1651,8 +1767,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>
> -	for_each_engine(engine, dev_priv, id)
> +	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = execlists_submit_request;
> +		engine->schedule = execlists_schedule;
> +	}
>  }
>
>  static void
> @@ -1665,6 +1783,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>  	engine->emit_breadcrumb = gen8_emit_breadcrumb;
>  	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
>  	engine->submit_request = execlists_submit_request;
> +	engine->schedule = execlists_schedule;
>
>  	engine->irq_enable = gen8_logical_ring_enable_irq;
>  	engine->irq_disable = gen8_logical_ring_disable_irq;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 75991a3c694b..cbc148863a03 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -348,7 +348,8 @@ struct intel_engine_cs {
>  		struct drm_i915_gem_request *request;
>  		unsigned int count;
>  	} execlist_port[2];
> -	struct list_head execlist_queue;
> +	struct rb_root execlist_queue;
> +	struct rb_node *execlist_first;
>  	unsigned int fw_domains;
>  	bool disable_lite_restore_wa;
>  	bool preempt_wa;
>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 14:45   ` Tvrtko Ursulin
@ 2016-11-10 15:01     ` Chris Wilson
  2016-11-10 15:36       ` Tvrtko Ursulin
  0 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 15:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 02:45:39PM +0000, Tvrtko Ursulin wrote:
> 
> On 07/11/2016 13:59, Chris Wilson wrote:
> >The scheduler needs to know the dependencies of each request for the
> >lifetime of the request, as it may choose to reschedule the requests at
> >any time and must ensure the dependency tree is not broken. This is in
> >addition to using the fence to only allow execution after all
> >dependencies have been completed.
> >
> >One option was to extend the fence to support the bidirectional
> >dependency tracking required by the scheduler. However the mismatch in
> >lifetimes between the submit fence and the request essentially meant
> >that we had to build a completely separate struct (and we could not
> >simply reuse the existing waitqueue in the fence for one half of the
> >dependency tracking). The extra dependency tracking simply did not mesh
> >well with the fence, and keeping it separate both keeps the fence
> >implementation simpler and allows us to extend the dependency tracking
> >into a priority tree (whilst maintaining support for reordering the
> >tree).
> >
> >To avoid the additional allocations and list manipulations, the use of
> >the priotree is disabled when there are no schedulers to use it.
> >
> >v2: Create a dedicated slab for i915_dependency.
> >    Rename the lists.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >---
> > drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
> > drivers/gpu/drm/i915/i915_drv.h            |   1 +
> > drivers/gpu/drm/i915/i915_gem.c            |  14 ++-
> > drivers/gpu/drm/i915/i915_gem_request.c    |  96 +++++++++++++++++++-
> > drivers/gpu/drm/i915/i915_gem_request.h    |  40 ++++++++-
> > drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
> > drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
> > drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
> > drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
> > 9 files changed, 282 insertions(+), 18 deletions(-)
> >
> >diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> >index 3cb96d260dfb..dac435680e98 100644
> >--- a/drivers/gpu/drm/i915/i915_debugfs.c
> >+++ b/drivers/gpu/drm/i915/i915_debugfs.c
> >@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
> > 			  struct drm_i915_gem_request *rq,
> > 			  const char *prefix)
> > {
> >-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
> >+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
> > 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
> >+		   rq->priotree.priority,
> > 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
> > 		   rq->timeline->common->name);
> > }
> >@@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> >
> > 		if (i915.enable_execlists) {
> > 			u32 ptr, read, write;
> >+			struct rb_node *rb;
> >
> > 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
> > 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> >@@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
> > 			rcu_read_unlock();
> >
> > 			spin_lock_irq(&engine->timeline->lock);
> >-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
> >+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> >+				rq = rb_entry(rb, typeof(*rq), priotree.node);
> > 				print_request(m, rq, "\t\tQ ");
> > 			}
> > 			spin_unlock_irq(&engine->timeline->lock);
> >diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> >index 4735b4177100..e790147209f3 100644
> >--- a/drivers/gpu/drm/i915/i915_drv.h
> >+++ b/drivers/gpu/drm/i915/i915_drv.h
> >@@ -1791,6 +1791,7 @@ struct drm_i915_private {
> > 	struct kmem_cache *objects;
> > 	struct kmem_cache *vmas;
> > 	struct kmem_cache *requests;
> >+	struct kmem_cache *dependencies;
> >
> > 	const struct intel_device_info info;
> >
> >diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> >index df803e82eb07..a4dc2da2323a 100644
> >--- a/drivers/gpu/drm/i915/i915_gem.c
> >+++ b/drivers/gpu/drm/i915/i915_gem.c
> >@@ -2693,10 +2693,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
> >
> > 		spin_lock_irqsave(&engine->timeline->lock, flags);
> >
> >-		INIT_LIST_HEAD(&engine->execlist_queue);
> > 		i915_gem_request_put(engine->execlist_port[0].request);
> > 		i915_gem_request_put(engine->execlist_port[1].request);
> > 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> >+		engine->execlist_queue = RB_ROOT;
> >+		engine->execlist_first = NULL;
> >
> > 		spin_unlock_irqrestore(&engine->timeline->lock, flags);
> > 	}
> >@@ -4754,12 +4755,18 @@ i915_gem_load_init(struct drm_device *dev)
> > 	if (!dev_priv->requests)
> > 		goto err_vmas;
> >
> >+	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
> >+					    SLAB_HWCACHE_ALIGN |
> >+					    SLAB_RECLAIM_ACCOUNT);
> >+	if (!dev_priv->dependencies)
> >+		goto err_requests;
> >+
> > 	mutex_lock(&dev_priv->drm.struct_mutex);
> > 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
> > 	err = i915_gem_timeline_init__global(dev_priv);
> > 	mutex_unlock(&dev_priv->drm.struct_mutex);
> > 	if (err)
> >-		goto err_requests;
> >+		goto err_dependencies;
> >
> > 	INIT_LIST_HEAD(&dev_priv->context_list);
> > 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
> >@@ -4787,6 +4794,8 @@ i915_gem_load_init(struct drm_device *dev)
> >
> > 	return 0;
> >
> >+err_dependencies:
> >+	kmem_cache_destroy(dev_priv->dependencies);
> > err_requests:
> > 	kmem_cache_destroy(dev_priv->requests);
> > err_vmas:
> >@@ -4803,6 +4812,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
> >
> > 	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
> >
> >+	kmem_cache_destroy(dev_priv->dependencies);
> > 	kmem_cache_destroy(dev_priv->requests);
> > 	kmem_cache_destroy(dev_priv->vmas);
> > 	kmem_cache_destroy(dev_priv->objects);
> >diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> >index 93f77df9bc51..278b103a4e95 100644
> >--- a/drivers/gpu/drm/i915/i915_gem_request.c
> >+++ b/drivers/gpu/drm/i915/i915_gem_request.c
> >@@ -113,6 +113,82 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
> > 	spin_unlock(&file_priv->mm.lock);
> > }
> >
> >+static struct i915_dependency *
> >+i915_dependency_alloc(struct drm_i915_private *i915)
> >+{
> >+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
> >+}
> >+
> >+static void
> >+i915_dependency_free(struct drm_i915_private *i915,
> >+		     struct i915_dependency *dep)
> >+{
> >+	kmem_cache_free(i915->dependencies, dep);
> >+}
> >+
> >+static void
> >+__i915_priotree_add_dependency(struct i915_priotree *pt,
> >+			       struct i915_priotree *signal,
> >+			       struct i915_dependency *dep,
> >+			       unsigned long flags)
> >+{
> >+	INIT_LIST_HEAD(&dep->dfs_link);
> >+	list_add(&dep->wait_link, &signal->waiters_list);
> >+	list_add(&dep->signal_link, &pt->signalers_list);
> >+	dep->signaler = signal;
> >+	dep->flags = flags;
> >+}
> >+
> >+static int
> >+i915_priotree_add_dependency(struct drm_i915_private *i915,
> >+			     struct i915_priotree *pt,
> >+			     struct i915_priotree *signal)
> >+{
> >+	struct i915_dependency *dep;
> >+
> >+	dep = i915_dependency_alloc(i915);
> >+	if (!dep)
> >+		return -ENOMEM;
> >+
> >+	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
> >+	return 0;
> >+}
> >+
> >+static void
> >+i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
> >+{
> >+	struct i915_dependency *dep, *next;
> >+
> >+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> >+
> >+	/* Everyone we depended upon (the fences we wait to be signaled)
> >+	 * should retire before us and remove themselves from our list.
> >+	 * However, retirement is run independently on each timeline and
> >+	 * so we may be called out-of-order.
> >+	 */
> >+	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
> >+		list_del(&dep->wait_link);
> >+		if (dep->flags & I915_DEPENDENCY_ALLOC)
> >+			i915_dependency_free(i915, dep);
> >+	}
> >+
> >+	/* Remove ourselves from everyone who depends upon us */
> >+	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
> >+		list_del(&dep->signal_link);
> >+		if (dep->flags & I915_DEPENDENCY_ALLOC)
> >+			i915_dependency_free(i915, dep);
> >+	}
> >+}
> >+
> >+static void
> >+i915_priotree_init(struct i915_priotree *pt)
> >+{
> >+	INIT_LIST_HEAD(&pt->signalers_list);
> >+	INIT_LIST_HEAD(&pt->waiters_list);
> >+	RB_CLEAR_NODE(&pt->node);
> >+	pt->priority = INT_MIN;
> 
> Not I915_PRIORITY_MIN? Or it has to be smaller? In which case
> BUILD_BUG_ON(INT_MIN >= I915_PRIORITY_MIN)?

I wanted it to be smaller than min so that the unset value was clear in
any debug trace.

> >+		rb = rb_next(rb);
> >+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> >+		RB_CLEAR_NODE(&cursor->priotree.node);
> >+		cursor->priotree.priority = INT_MAX;
> 
> What does setting the priority to INT_MAX here do?

It is used as a signal that the request has been sent to hardware
both as a short-circuit for the dfs and so that it is clear in the
debugfs.
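
For reference, the sentinel values as they stand in this version
(a sketch; the transitions are taken from the hunks above):

	i915_priotree_init(pt);		/* priority = INT_MIN: unset,
					 * deliberately below any real
					 * priority so it stands out in
					 * a debug trace */

	engine->schedule(rq, prio);	/* raised (up to I915_PRIORITY_MAX)
					 * while waiting in the rbtree */

	execlists_dequeue(engine);	/* priority = INT_MAX: sent to
					 * hardware; short-circuits the
					 * dfs and is obvious in debugfs */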
 
> >+static struct intel_engine_cs *
> >+pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
> >+{
> >+	struct intel_engine_cs *engine;
> >+
> >+	engine = container_of(pt,
> >+			      struct drm_i915_gem_request,
> >+			      priotree)->engine;
> >+	if (engine != locked) {
> >+		if (locked)
> >+			spin_unlock_irq(&locked->timeline->lock);
> >+		spin_lock_irq(&engine->timeline->lock);
> >+	}
> >+
> >+	return engine;
> >+}
> 
> Ha, cute. :)
> 
> >+
> >+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> >+{
> >+	struct intel_engine_cs *engine = NULL;
> >+	struct i915_dependency *dep, *p;
> >+	struct i915_dependency stack;
> >+	LIST_HEAD(dfs);
> >+
> >+	if (prio <= READ_ONCE(request->priotree.priority))
> >+		return;
> >+
> >+	/* Need BKL in order to use the temporary link inside i915_dependency */
> >+	lockdep_assert_held(&request->i915->drm.struct_mutex);
> 
> Could use a new i915 submission lock, but I suppose this is OK to
> start with.
> 
> >+
> >+	stack.signaler = &request->priotree;
> >+	list_add(&stack.dfs_link, &dfs);
> >+
> 
> Tada, onto the DFS which I am not familiar with - but there's always
> Wikipedia. :)
> 
> >+	/* Recursively bump all dependent priorities to match the new request */
> >+	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
> >+		struct i915_priotree *pt = dep->signaler;
> >+
> >+		list_for_each_entry(p, &pt->signalers_list, signal_link)
> >+			if (prio > READ_ONCE(p->signaler->priority))
> >+				list_move_tail(&p->dfs_link, &dfs);
> >+
> >+		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
> 
> This was fun. :)
> 
> >+		if (!RB_EMPTY_NODE(&pt->node))
> >+			continue;
> >+
> >+		engine = pt_lock_engine(pt, engine);
> >+
> >+		/* If it is not already in the rbtree, we can update the
> >+		 * priority inplace and skip over it (and its dependencies)
> >+		 * if it is referenced again as we descend the dfs.
> >+		 */
> 
> Are you sure it is OK to ignore the dependencies for new requests? I
> don't see why it would be.

We don't ignore the dependencies for new requests, they have already
been added to the list to be processed. What we are considering here is
what happens if this request is a dependency of a subsequent request in
the list. My statement is that since we are not in the rbtree, we are
not ready to be run (and in turn neither will be the following request
that depended upon this request). As we are not in the rbtree, we do not
need to reorder the rbtree to ensure the fifo ordering with our
dependencies.
 
> >+		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
> 
> Isn't the node guaranteed to be empty from the earlier test and continue?

We checked before we were certain we had the spinlock.
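
I.e. it is the usual optimistic-check-then-recheck-under-the-lock
pattern; an annotated sketch of the two tests from the hunk:

	if (!RB_EMPTY_NODE(&pt->node))	/* unlocked peek, may be stale */
		continue;

	engine = pt_lock_engine(pt, engine); /* timeline->lock now held */

	/* Recheck under the lock: the request may have been submitted
	 * (entered the rbtree) between the peek and taking the lock.
	 */
	if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
		pt->priority = prio;
		list_del_init(&dep->dfs_link);
	}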
 
> >+			pt->priority = prio;
> >+			list_del_init(&dep->dfs_link);
> >+		}
> >+	}
> >+
> >+	/* Fifo and depth-first replacement ensure our deps execute before us */
> >+	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
> >+		struct i915_priotree *pt = dep->signaler;
> >+
> >+		INIT_LIST_HEAD(&dep->dfs_link);
> >+
> >+		engine = pt_lock_engine(pt, engine);
> >+
> >+		if (prio <= pt->priority)
> >+			continue;
> 
> How would these priorities end up in the list? The first loop skips
> over them.

We keep on dropping the lock.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 15:01     ` Chris Wilson
@ 2016-11-10 15:36       ` Tvrtko Ursulin
  2016-11-10 15:55         ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-10 15:36 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 10/11/2016 15:01, Chris Wilson wrote:
> On Thu, Nov 10, 2016 at 02:45:39PM +0000, Tvrtko Ursulin wrote:
>>
>> On 07/11/2016 13:59, Chris Wilson wrote:
>>> The scheduler needs to know the dependencies of each request for the
>>> lifetime of the request, as it may choose to reschedule the requests at
>>> any time and must ensure the dependency tree is not broken. This is in
>>> addition to using the fence to only allow execution after all
>>> dependencies have been completed.
>>>
>>> One option was to extend the fence to support the bidirectional
>>> dependency tracking required by the scheduler. However the mismatch in
>>> lifetimes between the submit fence and the request essentially meant
>>> that we had to build a completely separate struct (and we could not
>>> simply reuse the existing waitqueue in the fence for one half of the
>>> dependency tracking). The extra dependency tracking simply did not mesh
>>> well with the fence, and keeping it separate both keeps the fence
>>> implementation simpler and allows us to extend the dependency tracking
>>> into a priority tree (whilst maintaining support for reordering the
>>> tree).
>>>
>>> To avoid the additional allocations and list manipulations, the use of
>>> the priotree is disabled when there are no schedulers to use it.
>>>
>>> v2: Create a dedicated slab for i915_dependency.
>>>    Rename the lists.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>> drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
>>> drivers/gpu/drm/i915/i915_drv.h            |   1 +
>>> drivers/gpu/drm/i915/i915_gem.c            |  14 ++-
>>> drivers/gpu/drm/i915/i915_gem_request.c    |  96 +++++++++++++++++++-
>>> drivers/gpu/drm/i915/i915_gem_request.h    |  40 ++++++++-
>>> drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
>>> drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
>>> drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
>>> drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
>>> 9 files changed, 282 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
>>> index 3cb96d260dfb..dac435680e98 100644
>>> --- a/drivers/gpu/drm/i915/i915_debugfs.c
>>> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
>>> @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
>>> 			  struct drm_i915_gem_request *rq,
>>> 			  const char *prefix)
>>> {
>>> -	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
>>> +	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
>>> 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
>>> +		   rq->priotree.priority,
>>> 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
>>> 		   rq->timeline->common->name);
>>> }
>>> @@ -3218,6 +3219,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>>>
>>> 		if (i915.enable_execlists) {
>>> 			u32 ptr, read, write;
>>> +			struct rb_node *rb;
>>>
>>> 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
>>> 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
>>> @@ -3257,7 +3259,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>>> 			rcu_read_unlock();
>>>
>>> 			spin_lock_irq(&engine->timeline->lock);
>>> -			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
>>> +			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
>>> +				rq = rb_entry(rb, typeof(*rq), priotree.node);
>>> 				print_request(m, rq, "\t\tQ ");
>>> 			}
>>> 			spin_unlock_irq(&engine->timeline->lock);
>>> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
>>> index 4735b4177100..e790147209f3 100644
>>> --- a/drivers/gpu/drm/i915/i915_drv.h
>>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>>> @@ -1791,6 +1791,7 @@ struct drm_i915_private {
>>> 	struct kmem_cache *objects;
>>> 	struct kmem_cache *vmas;
>>> 	struct kmem_cache *requests;
>>> +	struct kmem_cache *dependencies;
>>>
>>> 	const struct intel_device_info info;
>>>
>>> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
>>> index df803e82eb07..a4dc2da2323a 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem.c
>>> @@ -2693,10 +2693,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>>>
>>> 		spin_lock_irqsave(&engine->timeline->lock, flags);
>>>
>>> -		INIT_LIST_HEAD(&engine->execlist_queue);
>>> 		i915_gem_request_put(engine->execlist_port[0].request);
>>> 		i915_gem_request_put(engine->execlist_port[1].request);
>>> 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
>>> +		engine->execlist_queue = RB_ROOT;
>>> +		engine->execlist_first = NULL;
>>>
>>> 		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>>> 	}
>>> @@ -4754,12 +4755,18 @@ i915_gem_load_init(struct drm_device *dev)
>>> 	if (!dev_priv->requests)
>>> 		goto err_vmas;
>>>
>>> +	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
>>> +					    SLAB_HWCACHE_ALIGN |
>>> +					    SLAB_RECLAIM_ACCOUNT);
>>> +	if (!dev_priv->dependencies)
>>> +		goto err_requests;
>>> +
>>> 	mutex_lock(&dev_priv->drm.struct_mutex);
>>> 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>>> 	err = i915_gem_timeline_init__global(dev_priv);
>>> 	mutex_unlock(&dev_priv->drm.struct_mutex);
>>> 	if (err)
>>> -		goto err_requests;
>>> +		goto err_dependencies;
>>>
>>> 	INIT_LIST_HEAD(&dev_priv->context_list);
>>> 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
>>> @@ -4787,6 +4794,8 @@ i915_gem_load_init(struct drm_device *dev)
>>>
>>> 	return 0;
>>>
>>> +err_dependencies:
>>> +	kmem_cache_destroy(dev_priv->dependencies);
>>> err_requests:
>>> 	kmem_cache_destroy(dev_priv->requests);
>>> err_vmas:
>>> @@ -4803,6 +4812,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
>>>
>>> 	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
>>>
>>> +	kmem_cache_destroy(dev_priv->dependencies);
>>> 	kmem_cache_destroy(dev_priv->requests);
>>> 	kmem_cache_destroy(dev_priv->vmas);
>>> 	kmem_cache_destroy(dev_priv->objects);
>>> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
>>> index 93f77df9bc51..278b103a4e95 100644
>>> --- a/drivers/gpu/drm/i915/i915_gem_request.c
>>> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
>>> @@ -113,6 +113,82 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>>> 	spin_unlock(&file_priv->mm.lock);
>>> }
>>>
>>> +static struct i915_dependency *
>>> +i915_dependency_alloc(struct drm_i915_private *i915)
>>> +{
>>> +	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
>>> +}
>>> +
>>> +static void
>>> +i915_dependency_free(struct drm_i915_private *i915,
>>> +		     struct i915_dependency *dep)
>>> +{
>>> +	kmem_cache_free(i915->dependencies, dep);
>>> +}
>>> +
>>> +static void
>>> +__i915_priotree_add_dependency(struct i915_priotree *pt,
>>> +			       struct i915_priotree *signal,
>>> +			       struct i915_dependency *dep,
>>> +			       unsigned long flags)
>>> +{
>>> +	INIT_LIST_HEAD(&dep->dfs_link);
>>> +	list_add(&dep->wait_link, &signal->waiters_list);
>>> +	list_add(&dep->signal_link, &pt->signalers_list);
>>> +	dep->signaler = signal;
>>> +	dep->flags = flags;
>>> +}
>>> +
>>> +static int
>>> +i915_priotree_add_dependency(struct drm_i915_private *i915,
>>> +			     struct i915_priotree *pt,
>>> +			     struct i915_priotree *signal)
>>> +{
>>> +	struct i915_dependency *dep;
>>> +
>>> +	dep = i915_dependency_alloc(i915);
>>> +	if (!dep)
>>> +		return -ENOMEM;
>>> +
>>> +	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
>>> +	return 0;
>>> +}
>>> +
>>> +static void
>>> +i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
>>> +{
>>> +	struct i915_dependency *dep, *next;
>>> +
>>> +	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
>>> +
>>> +	/* Everyone we depended upon (the fences we wait to be signaled)
>>> +	 * should retire before us and remove themselves from our list.
>>> +	 * However, retirement is run independently on each timeline and
>>> +	 * so we may be called out-of-order.
>>> +	 */
>>> +	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
>>> +		list_del(&dep->wait_link);
>>> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
>>> +			i915_dependency_free(i915, dep);
>>> +	}
>>> +
>>> +	/* Remove ourselves from everyone who depends upon us */
>>> +	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
>>> +		list_del(&dep->signal_link);
>>> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
>>> +			i915_dependency_free(i915, dep);
>>> +	}
>>> +}
>>> +
>>> +static void
>>> +i915_priotree_init(struct i915_priotree *pt)
>>> +{
>>> +	INIT_LIST_HEAD(&pt->signalers_list);
>>> +	INIT_LIST_HEAD(&pt->waiters_list);
>>> +	RB_CLEAR_NODE(&pt->node);
>>> +	pt->priority = INT_MIN;
>>
>> Not I915_PRIORITY_MIN? Or it has to be smaller? In which case
>> BUILD_BUG_ON(INT_MIN >= I915_PRIORITY_MIN)?
>
> I wanted it to be smaller than min so that the unset value was clear in
> any debug trace.
>
>>> +		rb = rb_next(rb);
>>> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
>>> +		RB_CLEAR_NODE(&cursor->priotree.node);
>>> +		cursor->priotree.priority = INT_MAX;
>>
>> What does setting the priority to INT_MAX here do?
>
> It is used as a signal that the request has been sent to hardware
> both as a short-circuit for the dfs and so that it is clear in the
> debugfs.
>
>>> +static struct intel_engine_cs *
>>> +pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
>>> +{
>>> +	struct intel_engine_cs *engine;
>>> +
>>> +	engine = container_of(pt,
>>> +			      struct drm_i915_gem_request,
>>> +			      priotree)->engine;
>>> +	if (engine != locked) {
>>> +		if (locked)
>>> +			spin_unlock_irq(&locked->timeline->lock);
>>> +		spin_lock_irq(&engine->timeline->lock);
>>> +	}
>>> +
>>> +	return engine;
>>> +}
>>
>> Ha, cute. :)
>>
>>> +
>>> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>>> +{
>>> +	struct intel_engine_cs *engine = NULL;
>>> +	struct i915_dependency *dep, *p;
>>> +	struct i915_dependency stack;
>>> +	LIST_HEAD(dfs);
>>> +
>>> +	if (prio <= READ_ONCE(request->priotree.priority))
>>> +		return;
>>> +
>>> +	/* Need BKL in order to use the temporary link inside i915_dependency */
>>> +	lockdep_assert_held(&request->i915->drm.struct_mutex);
>>
>> Could use a new i915 submission lock, but I suppose this is OK to
>> start with.
>>
>>> +
>>> +	stack.signaler = &request->priotree;
>>> +	list_add(&stack.dfs_link, &dfs);
>>> +
>>
>> Tada, onto the DFS which I am not familiar with - but there's always
>> Wikipedia. :)
>>
>>> +	/* Recursively bump all dependent priorities to match the new request */
>>> +	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
>>> +		struct i915_priotree *pt = dep->signaler;
>>> +
>>> +		list_for_each_entry(p, &pt->signalers_list, signal_link)
>>> +			if (prio > READ_ONCE(p->signaler->priority))
>>> +				list_move_tail(&p->dfs_link, &dfs);
>>> +
>>> +		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
>>
>> This was fun. :)

Maybe clearer as:

p = list_next_entry(dep, dfs_link) ?
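
(The two spellings are equivalent here - both resume the walk at the
entry following dep:

	p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
	p = list_next_entry(dep, dfs_link);

list_first_entry() is being handed a member as if it were a list head,
which works but reads oddly; list_next_entry() says what is meant.)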

>>
>>> +		if (!RB_EMPTY_NODE(&pt->node))
>>> +			continue;
>>> +
>>> +		engine = pt_lock_engine(pt, engine);
>>> +
>>> +		/* If it is not already in the rbtree, we can update the
>>> +		 * priority inplace and skip over it (and its dependencies)
>>> +		 * if it is referenced again as we descend the dfs.
>>> +		 */
>>
>> Are you sure it is OK to ignore the dependencies for new requests? I
>> don't see why it would be.
>
> We don't ignore the dependencies for new requests, they have already
> been added to the list to be processed. What we are considering here is
> what happens if this request is a dependency of a subsequent request in
> the list. My statement is that since we are not in the rbtree, we are
> not ready to be run (and in turn neither will be the following request
> that depended upon this request). As we are not in the rbtree, we do not
> need to reorder the rbtree to ensure the fifo ordering with our
> dependencies.

Yes I've lost context while looking at it, you are right.

>>> +		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
>>
>> Isn't the node guaranteed to be empty from the earlier test and continue?
>
> We checked before we were certain we had the spinlock.

I missed that as well, makes sense.

>
>>> +			pt->priority = prio;
>>> +			list_del_init(&dep->dfs_link);
>>> +		}
>>> +	}
>>> +
>>> +	/* Fifo and depth-first replacement ensure our deps execute before us */
>>> +	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
>>> +		struct i915_priotree *pt = dep->signaler;
>>> +
>>> +		INIT_LIST_HEAD(&dep->dfs_link);
>>> +
>>> +		engine = pt_lock_engine(pt, engine);
>>> +
>>> +		if (prio <= pt->priority)
>>> +			continue;
>>
>> How would these priorities end up in the list? The first loop skips
>> over them.
>
> We keep on dropping the lock.

Yeah, looks OK to me.

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-10 15:36       ` Tvrtko Ursulin
@ 2016-11-10 15:55         ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-10 15:55 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Nov 10, 2016 at 03:36:17PM +0000, Tvrtko Ursulin wrote:
> 
> On 10/11/2016 15:01, Chris Wilson wrote:
> >On Thu, Nov 10, 2016 at 02:45:39PM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 07/11/2016 13:59, Chris Wilson wrote:
> >>>+	/* Recursively bump all dependent priorities to match the new request */
> >>>+	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
> >>>+		struct i915_priotree *pt = dep->signaler;
> >>>+
> >>>+		list_for_each_entry(p, &pt->signalers_list, signal_link)
> >>>+			if (prio > READ_ONCE(p->signaler->priority))
> >>>+				list_move_tail(&p->dfs_link, &dfs);
> >>>+
> >>>+		p = list_first_entry(&dep->dfs_link, typeof(*p), dfs_link);
> >>
> >>This was fun. :)
> 
> Maybe clearer as:
> 
> p = list_next_entry(dep, dfs_link) ?

Fancy.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
                   ` (12 preceding siblings ...)
  2016-11-10 11:45 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2) Patchwork
@ 2016-11-14  8:56 ` Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 02/14] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
                     ` (15 more replies)
  13 siblings, 16 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

Localise the static struct lock_class_key to the caller of
i915_sw_fence_init() so that we create a lock_class instance for each
unique sw_fence rather than all sw_fences sharing the same
lock_class. This eliminates some lockdep false positives when using fences
from within fence callbacks.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_sw_fence.c |  7 +++++--
 drivers/gpu/drm/i915/i915_sw_fence.h | 11 ++++++++++-
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 95f2f12e0917..147420ccf49c 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -116,11 +116,14 @@ static void i915_sw_fence_await(struct i915_sw_fence *fence)
 	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
 }
 
-void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
+void __i915_sw_fence_init(struct i915_sw_fence *fence,
+			  i915_sw_fence_notify_t fn,
+			  const char *name,
+			  struct lock_class_key *key)
 {
 	BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
 
-	init_waitqueue_head(&fence->wait);
+	__init_waitqueue_head(&fence->wait, name, key);
 	kref_init(&fence->kref);
 	atomic_set(&fence->pending, 1);
 	fence->flags = (unsigned long)fn;
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 707dfc4f0da5..a5546eb2b5cd 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -40,7 +40,16 @@ typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
 				      enum i915_sw_fence_notify state);
 #define __i915_sw_fence_call __aligned(4)
 
-void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
+void __i915_sw_fence_init(struct i915_sw_fence *fence,
+			  i915_sw_fence_notify_t fn,
+			  const char *name,
+			  struct lock_class_key *key);
+#define i915_sw_fence_init(fence, fn) do {			\
+	static struct lock_class_key __key; 			\
+								\
+	__i915_sw_fence_init((fence), fn, #fence, &__key);	\
+} while (0)
+
 void i915_sw_fence_commit(struct i915_sw_fence *fence);
 
 int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
-- 
2.10.2
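
For illustration, the point of keying on the call site: each invocation
of the macro gets its own static lock_class_key, and #fence stringifies
the argument for the waitqueue name, so e.g. the two fences embedded in
a request (added later in this series) land in distinct lockdep classes.
A sketch:

	/* Two call sites => two distinct lockdep classes: */
	i915_sw_fence_init(&req->submit, submit_notify);	/* named "&req->submit" */
	i915_sw_fence_init(&req->execute, execute_notify);	/* named "&req->execute" */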

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 02/14] drm/i915: Create distinct lockclasses for execution vs user timelines
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 03/14] drm/i915: Split request submit/execute phase into two Chris Wilson
                     ` (14 subsequent siblings)
  15 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

In order to simplify the lockdep annotation, as they become more complex
in the future with deferred execution and multiple paths through the
same functions, create a separate lockclass for the user timeline and
the hardware execution timeline.

We should only ever be locking the user timeline and the execution
timeline in parallel so we only need to create two lock classes, rather
than a separate class for every timeline.

v2: Rename the lock classes to be more consistent with other lockdep.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c          |  4 +---
 drivers/gpu/drm/i915/i915_gem_request.c  |  2 +-
 drivers/gpu/drm/i915/i915_gem_timeline.c | 33 ++++++++++++++++++++++++++++----
 drivers/gpu/drm/i915/i915_gem_timeline.h |  1 +
 4 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 3b021e9e3379..8a812dc334da 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4429,9 +4429,7 @@ i915_gem_load_init(struct drm_device *dev)
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
-	err = i915_gem_timeline_init(dev_priv,
-				     &dev_priv->gt.global_timeline,
-				     "[execution]");
+	err = i915_gem_timeline_init__global(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	if (err)
 		goto err_requests;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 5050464c5401..f25b537d6e64 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -346,7 +346,7 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 				request->ring->vaddr + request->postfix);
 	engine->submit_request(request);
 
-	spin_lock_nested(&request->timeline->lock, SINGLE_DEPTH_NESTING);
+	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.c b/drivers/gpu/drm/i915/i915_gem_timeline.c
index fc8f13a79f8f..bf8a471b61e6 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.c
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.c
@@ -24,9 +24,11 @@
 
 #include "i915_drv.h"
 
-int i915_gem_timeline_init(struct drm_i915_private *i915,
-			   struct i915_gem_timeline *timeline,
-			   const char *name)
+static int __i915_gem_timeline_init(struct drm_i915_private *i915,
+				    struct i915_gem_timeline *timeline,
+				    const char *name,
+				    struct lock_class_key *lockclass,
+				    const char *lockname)
 {
 	unsigned int i;
 	u64 fences;
@@ -47,8 +49,11 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
 
 		tl->fence_context = fences++;
 		tl->common = timeline;
-
+#ifdef CONFIG_DEBUG_SPINLOCK
+		__raw_spin_lock_init(&tl->lock.rlock, lockname, lockclass);
+#else
 		spin_lock_init(&tl->lock);
+#endif
 		init_request_active(&tl->last_request, NULL);
 		INIT_LIST_HEAD(&tl->requests);
 	}
@@ -56,6 +61,26 @@ int i915_gem_timeline_init(struct drm_i915_private *i915,
 	return 0;
 }
 
+int i915_gem_timeline_init(struct drm_i915_private *i915,
+			   struct i915_gem_timeline *timeline,
+			   const char *name)
+{
+	static struct lock_class_key class;
+
+	return __i915_gem_timeline_init(i915, timeline, name,
+					&class, "&timeline->lock");
+}
+
+int i915_gem_timeline_init__global(struct drm_i915_private *i915)
+{
+	static struct lock_class_key class;
+
+	return __i915_gem_timeline_init(i915,
+					&i915->gt.global_timeline,
+					"[execution]",
+					&class, "&global_timeline->lock");
+}
+
 void i915_gem_timeline_fini(struct i915_gem_timeline *tl)
 {
 	lockdep_assert_held(&tl->i915->drm.struct_mutex);
diff --git a/drivers/gpu/drm/i915/i915_gem_timeline.h b/drivers/gpu/drm/i915/i915_gem_timeline.h
index f2bf7b1d49a1..98d99a62b4ae 100644
--- a/drivers/gpu/drm/i915/i915_gem_timeline.h
+++ b/drivers/gpu/drm/i915/i915_gem_timeline.h
@@ -67,6 +67,7 @@ struct i915_gem_timeline {
 int i915_gem_timeline_init(struct drm_i915_private *i915,
 			   struct i915_gem_timeline *tl,
 			   const char *name);
+int i915_gem_timeline_init__global(struct drm_i915_private *i915);
 void i915_gem_timeline_fini(struct i915_gem_timeline *tl);
 
 #endif
-- 
2.10.2
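
The resulting split, in sketch form: all per-context user timelines
share one class, while the single execution timeline gets its own,
which is exactly the pair of classes the commit message argues is
sufficient:

	/* user timeline: static class "&timeline->lock" */
	i915_gem_timeline_init(i915, timeline, name);

	/* execution timeline: static class "&global_timeline->lock" */
	i915_gem_timeline_init__global(i915);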

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 03/14] drm/i915: Split request submit/execute phase into two
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 02/14] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
                     ` (13 subsequent siblings)
  15 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

In order to support deferred scheduling, we need to differentiate
between when the request is ready to run (i.e. the submit fence is
signaled) and when the request is actually run (a new execute fence).
This is typically split between the request itself wanting to wait upon
others (for which we use the submit fence) and the CPU wanting to wait
upon the request, for which we use the execute fence to be sure the
hardware is ready to signal completion.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_gem_request.c | 33 ++++++++++++++++++++++++---------
 drivers/gpu/drm/i915/i915_gem_request.h | 15 +++++++++++++++
 2 files changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index f25b537d6e64..d0f6b9f82636 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -350,11 +350,19 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
+	i915_sw_fence_commit(&request->execute);
+
 	spin_unlock_irqrestore(&timeline->lock, flags);
 
 	return NOTIFY_DONE;
 }
 
+static int __i915_sw_fence_call
+execute_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	return NOTIFY_DONE;
+}
+
 /**
  * i915_gem_request_alloc - allocate a request structure
  *
@@ -440,6 +448,12 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 		       __timeline_get_seqno(req->timeline->common));
 
 	i915_sw_fence_init(&req->submit, submit_notify);
+	i915_sw_fence_init(&req->execute, execute_notify);
+	/* Ensure that the execute fence completes after the submit fence -
+	 * as we complete the execute fence from within the submit fence
+	 * callback, its completion would otherwise be visible first.
+	 */
+	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
@@ -816,9 +830,9 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req,
 }
 
 static long
-__i915_request_wait_for_submit(struct drm_i915_gem_request *request,
-			       unsigned int flags,
-			       long timeout)
+__i915_request_wait_for_execute(struct drm_i915_gem_request *request,
+				unsigned int flags,
+				long timeout)
 {
 	const int state = flags & I915_WAIT_INTERRUPTIBLE ?
 		TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE;
@@ -830,9 +844,9 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 		add_wait_queue(q, &reset);
 
 	do {
-		prepare_to_wait(&request->submit.wait, &wait, state);
+		prepare_to_wait(&request->execute.wait, &wait, state);
 
-		if (i915_sw_fence_done(&request->submit))
+		if (i915_sw_fence_done(&request->execute))
 			break;
 
 		if (flags & I915_WAIT_LOCKED &&
@@ -850,7 +864,7 @@ __i915_request_wait_for_submit(struct drm_i915_gem_request *request,
 
 		timeout = io_schedule_timeout(timeout);
 	} while (timeout);
-	finish_wait(&request->submit.wait, &wait);
+	finish_wait(&request->execute.wait, &wait);
 
 	if (flags & I915_WAIT_LOCKED)
 		remove_wait_queue(q, &reset);
@@ -902,13 +916,14 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 
 	trace_i915_gem_request_wait_begin(req);
 
-	if (!i915_sw_fence_done(&req->submit)) {
-		timeout = __i915_request_wait_for_submit(req, flags, timeout);
+	if (!i915_sw_fence_done(&req->execute)) {
+		timeout = __i915_request_wait_for_execute(req, flags, timeout);
 		if (timeout < 0)
 			goto complete;
 
-		GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
+		GEM_BUG_ON(!i915_sw_fence_done(&req->execute));
 	}
+	GEM_BUG_ON(!i915_sw_fence_done(&req->submit));
 	GEM_BUG_ON(!req->global_seqno);
 
 	/* Optimistic short spin before touching IRQs */
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index a56559e3b034..4976039189ea 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -87,8 +87,23 @@ struct drm_i915_gem_request {
 	struct intel_timeline *timeline;
 	struct intel_signal_node signaling;
 
+	/* Fences for the various phases in the request's lifetime.
+	 *
+	 * The submit fence is used to await upon all of the request's
+	 * dependencies. When it is signaled, the request is ready to run.
+	 * It is used by the driver to then queue the request for execution.
+	 *
+	 * The execute fence is used to signal when the request has been
+	 * sent to hardware.
+	 *
+	 * It is illegal for the submit fence of one request to wait upon the
+	 * execute fence of an earlier request. It should be sufficient to
+	 * wait upon the submit fence of the earlier request.
+	 */
 	struct i915_sw_fence submit;
+	struct i915_sw_fence execute;
 	wait_queue_t submitq;
+	wait_queue_t execq;
 
 	u32 global_seqno;
 
-- 
2.10.2
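
The resulting ordering, as a sketch of the request lifecycle implied by
the commit message and the comment block above:

	/*
	 * dependencies signalled
	 *         |
	 *         v
	 *    req->submit   -- request is ready to run; other requests
	 *         |           wait on this fence
	 *         v
	 *    req->execute  -- request sent to hardware; CPU waiters
	 *         |           (i915_wait_request) now wait here
	 *         v
	 *    breadcrumb / global_seqno signalled
	 */

The await added in i915_gem_request_alloc() guarantees the execute
fence can never be observed complete before the submit fence.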

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 02/14] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 03/14] drm/i915: Split request submit/execute phase into two Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14 10:59     ` Tvrtko Ursulin
  2016-11-14  8:56   ` [PATCH v3 05/14] drm/i915: Remove engine->execlist_lock Chris Wilson
                     ` (12 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

Defer the transfer from the client's timeline onto the execution
timeline from the point of readiness to the point of actual submission.
For example, in execlists, a request is finally submitted to hardware
when the hardware is ready, and only put onto the hardware queue when
the request is ready. By deferring the transfer, we ensure that the
timeline is maintained in retirement order if we decide to queue the
requests onto the hardware in a different order than fifo.

v2: Rebased onto distinct global/user timeline lock classes.
v3: Play with the position of the spin_lock().
v4: Nesting finally resolved with distinct sw_fence lock classes.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_request.c    | 38 ++++++++++++++++++++----------
 drivers/gpu/drm/i915/i915_gem_request.h    |  3 +++
 drivers/gpu/drm/i915/i915_guc_submission.c | 14 ++++++++++-
 drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++-------
 drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
 5 files changed, 57 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index d0f6b9f82636..952d2aec5244 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -306,25 +306,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
 	return atomic_inc_return(&tl->next_seqno);
 }
 
-static int __i915_sw_fence_call
-submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+void __i915_gem_request_submit(struct drm_i915_gem_request *request)
 {
-	struct drm_i915_gem_request *request =
-		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;
 	struct intel_timeline *timeline;
-	unsigned long flags;
 	u32 seqno;
 
-	if (state != FENCE_COMPLETE)
-		return NOTIFY_DONE;
-
 	/* Transfer from per-context onto the global per-engine timeline */
 	timeline = engine->timeline;
 	GEM_BUG_ON(timeline == request->timeline);
-
-	/* Will be called from irq-context when using foreign DMA fences */
-	spin_lock_irqsave(&timeline->lock, flags);
+	assert_spin_locked(&timeline->lock);
 
 	seqno = timeline_get_seqno(timeline->common);
 	GEM_BUG_ON(!seqno);
@@ -344,15 +335,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 	GEM_BUG_ON(!request->global_seqno);
 	engine->emit_breadcrumb(request,
 				request->ring->vaddr + request->postfix);
-	engine->submit_request(request);
 
 	spin_lock(&request->timeline->lock);
 	list_move_tail(&request->link, &timeline->requests);
 	spin_unlock(&request->timeline->lock);
 
 	i915_sw_fence_commit(&request->execute);
+}
+
+void i915_gem_request_submit(struct drm_i915_gem_request *request)
+{
+	struct intel_engine_cs *engine = request->engine;
+	unsigned long flags;
 
-	spin_unlock_irqrestore(&timeline->lock, flags);
+	/* Will be called from irq-context when using foreign fences. */
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+
+	__i915_gem_request_submit(request);
+
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+}
+
+static int __i915_sw_fence_call
+submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
+{
+	if (state == FENCE_COMPLETE) {
+		struct drm_i915_gem_request *request =
+			container_of(fence, typeof(*request), submit);
+
+		request->engine->submit_request(request);
+	}
 
 	return NOTIFY_DONE;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 4976039189ea..4d2784633d9f 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -232,6 +232,9 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
 #define i915_add_request_no_flush(req) \
 	__i915_add_request(req, false)
 
+void __i915_gem_request_submit(struct drm_i915_gem_request *request);
+void i915_gem_request_submit(struct drm_i915_gem_request *request);
+
 struct intel_rps_client;
 #define NO_WAITBOOST ERR_PTR(-1)
 #define IS_RPS_CLIENT(p) (!IS_ERR(p))
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 666dab7a675a..942f5000d372 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
 	struct drm_i915_private *dev_priv = rq->i915;
-	unsigned int engine_id = rq->engine->id;
+	struct intel_engine_cs *engine = rq->engine;
+	unsigned int engine_id = engine->id;
 	struct intel_guc *guc = &rq->i915->guc;
 	struct i915_guc_client *client = guc->execbuf_client;
 	int b_ret;
 
+	/* We keep the previous context alive until we retire the following
+	 * request. This ensures that the context object is still pinned
+	 * for any residual writes the HW makes into it on the context switch
+	 * into the next object following the breadcrumb. Otherwise, we may
+	 * retire the context too early.
+	 */
+	rq->previous_context = engine->last_context;
+	engine->last_context = rq->ctx;
+
+	i915_gem_request_submit(rq);
+
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
 
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dde04b7643b1..dca41834dec1 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *cursor, *last;
 	struct execlist_port *port = engine->execlist_port;
+	unsigned long flags;
 	bool submit = false;
 
 	last = port->request;
@@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 * and context switches) submission.
 	 */
 
+	spin_lock_irqsave(&engine->timeline->lock, flags);
 	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
@@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			i915_gem_request_assign(&port->request, last);
 			port++;
 		}
+
+		/* We keep the previous context alive until we retire the
+		 * following request. This ensures that the context object
+		 * is still pinned for any residual writes the HW makes into it
+		 * on the context switch into the next object following the
+		 * breadcrumb. Otherwise, we may retire the context too early.
+		 */
+		cursor->previous_context = engine->last_context;
+		engine->last_context = cursor->ctx;
+
+		__i915_gem_request_submit(cursor);
 		last = cursor;
 		submit = true;
 	}
@@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		i915_gem_request_assign(&port->request, last);
 	}
 	spin_unlock(&engine->execlist_lock);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
 		execlists_submit_ports(engine);
@@ -621,15 +635,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 
 	spin_lock_irqsave(&engine->execlist_lock, flags);
 
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 700e93d80616..aeb637dc1fdf 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_private *dev_priv = request->i915;
 
+	i915_gem_request_submit(request);
+
 	I915_WRITE_TAIL(request->engine, request->tail);
 }
 
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 05/14] drm/i915: Remove engine->execlist_lock
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (2 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 06/14] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
                     ` (11 subsequent siblings)
  15 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

The execlist_lock is now completely subsumed by the engine->timeline->lock,
and so we can remove the redundant layer of locking.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_debugfs.c     | 4 ++--
 drivers/gpu/drm/i915/i915_gem.c         | 8 ++++++--
 drivers/gpu/drm/i915/intel_engine_cs.c  | 1 -
 drivers/gpu/drm/i915/intel_lrc.c        | 7 +++----
 drivers/gpu/drm/i915/intel_ringbuffer.h | 1 -
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index bce38803f45c..03e3c2afbb06 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -3254,11 +3254,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 				seq_printf(m, "\t\tELSP[1] idle\n");
 			rcu_read_unlock();
 
-			spin_lock_irq(&engine->execlist_lock);
+			spin_lock_irq(&engine->timeline->lock);
 			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
 				print_request(m, rq, "\t\tQ ");
 			}
-			spin_unlock_irq(&engine->execlist_lock);
+			spin_unlock_irq(&engine->timeline->lock);
 		} else if (INTEL_GEN(dev_priv) > 6) {
 			seq_printf(m, "\tPP_DIR_BASE: 0x%08x\n",
 				   I915_READ(RING_PP_DIR_BASE(engine)));
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8a812dc334da..e1afa11609a0 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2718,12 +2718,16 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 	 */
 
 	if (i915.enable_execlists) {
-		spin_lock(&engine->execlist_lock);
+		unsigned long flags;
+
+		spin_lock_irqsave(&engine->timeline->lock, flags);
+
 		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
-		spin_unlock(&engine->execlist_lock);
+
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 }
 
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 841f8d1e1410..298f0f95dd3f 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -237,7 +237,6 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
 	INIT_LIST_HEAD(&engine->execlist_queue);
-	spin_lock_init(&engine->execlist_lock);
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index dca41834dec1..d1aea7462515 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -471,7 +471,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	spin_lock(&engine->execlist_lock);
 	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
@@ -524,7 +523,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 
 		i915_gem_request_assign(&port->request, last);
 	}
-	spin_unlock(&engine->execlist_lock);
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
 	if (submit)
@@ -633,13 +631,14 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 	struct intel_engine_cs *engine = request->engine;
 	unsigned long flags;
 
-	spin_lock_irqsave(&engine->execlist_lock, flags);
+	/* Will be called from irq-context when using foreign fences. */
+	spin_lock_irqsave(&engine->timeline->lock, flags);
 
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 
-	spin_unlock_irqrestore(&engine->execlist_lock, flags);
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index d1a728791ad4..e1351870c203 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -335,7 +335,6 @@ struct intel_engine_cs {
 
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
-	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
 	struct execlist_port {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 06/14] drm/i915/scheduler: Signal the arrival of a new request
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (3 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 05/14] drm/i915: Remove engine->execlist_lock Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14  8:56   ` [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
                     ` (10 subsequent siblings)
  15 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

The start of the scheduler, add a hook into request submission for the
scheduler to see the arrival of new requests and prepare its runqueues.
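
Userspace can probe the new parameter with the usual getparam ioctl; a
minimal sketch (error handling trimmed, assumes the updated i915_drm.h):

	#include <stdbool.h>
	#include <sys/ioctl.h>
	#include <drm/i915_drm.h>

	static bool has_scheduler(int fd)
	{
		drm_i915_getparam_t gp = { .param = I915_PARAM_HAS_SCHEDULER };
		int value = 0;

		gp.value = &value;
		if (ioctl(fd, DRM_IOCTL_I915_GETPARAM, &gp))
			return false; /* older kernel, no scheduler */

		return value != 0;
	}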

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c         |  4 ++++
 drivers/gpu/drm/i915/i915_gem_request.c | 13 +++++++++++++
 drivers/gpu/drm/i915/intel_engine_cs.c  |  3 +++
 drivers/gpu/drm/i915/intel_ringbuffer.h |  9 +++++++++
 include/uapi/drm/i915_drm.h             |  5 +++++
 5 files changed, 34 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 4efec8b401bc..99fc075100b1 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -323,6 +323,10 @@ static int i915_getparam(struct drm_device *dev, void *data,
 		 */
 		value = i915_gem_mmap_gtt_version();
 		break;
+	case I915_PARAM_HAS_SCHEDULER:
+		value = dev_priv->engine[RCS] &&
+			dev_priv->engine[RCS]->schedule;
+		break;
 	case I915_PARAM_MMAP_VERSION:
 		/* Remember to bump this if the version changes! */
 	case I915_PARAM_HAS_GEM:
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 952d2aec5244..1118cf48d6f0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -762,6 +762,19 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	i915_gem_mark_busy(engine);
 
+	/* Let the backend know a new request has arrived that may need
+	 * to adjust the existing execution schedule due to a high priority
+	 * request - i.e. we may want to preempt the current request in order
+	 * to run a high priority dependency chain *before* we can execute this
+	 * request.
+	 *
+	 * This is called before the request is ready to run so that we can
+	 * decide whether to preempt the entire chain so that it is ready to
+	 * run at the earliest possible convenience.
+	 */
+	if (engine->schedule)
+		engine->schedule(request, 0);
+
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
 	local_bh_enable(); /* Kick the execlists tasklet if just scheduled */
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index 298f0f95dd3f..c9171a058478 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -102,6 +102,9 @@ intel_engine_setup(struct drm_i915_private *dev_priv,
 	engine->mmio_base = info->mmio_base;
 	engine->irq_shift = info->irq_shift;
 
+	/* Nothing to do here, execute in order of dependencies */
+	engine->schedule = NULL;
+
 	dev_priv->engine[id] = engine;
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index e1351870c203..b9583941eb6b 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -267,6 +267,15 @@ struct intel_engine_cs {
 	 */
 	void		(*submit_request)(struct drm_i915_gem_request *req);
 
+	/* Call when the priority on a request has changed and it and its
+	 * dependencies may need rescheduling. Note the request itself may
+	 * not be ready to run!
+	 *
+	 * Called under the struct_mutex.
+	 */
+	void		(*schedule)(struct drm_i915_gem_request *request,
+				    int priority);
+
 	/* Some chipsets are not quite as coherent as advertised and need
 	 * an expensive kick to force a true read of the up-to-date seqno.
 	 * However, the up-to-date seqno is not always required and the last
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 03725fe89859..1c12a350eca3 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -389,6 +389,11 @@ typedef struct drm_i915_irq_wait {
 #define I915_PARAM_MIN_EU_IN_POOL	 39
 #define I915_PARAM_MMAP_GTT_VERSION	 40
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
+ * priorities and the driver will attempt to execute batches in priority order.
+ */
+#define I915_PARAM_HAS_SCHEDULER	 41
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (4 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 06/14] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14 11:09     ` Tvrtko Ursulin
  2016-11-14  8:56   ` [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
                     ` (9 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

The scheduler needs to know the dependencies of each request for the
lifetime of the request, as it may choose to reschedule the requests at
any time and must ensure the dependency tree is not broken. This is in
addition to using the fence to allow execution only after all
dependencies have been completed.

One option was to extend the fence to support the bidirectional
dependency tracking required by the scheduler. However the mismatch in
lifetimes between the submit fence and the request essentially meant
that we had to build a completely separate struct (and we could not
simply reuse the existing waitqueue in the fence for one half of the
dependency tracking). The extra dependency tracking simply did not mesh
well with the fence, and keeping it separate both keeps the fence
implementation simpler and allows us to extend the dependency tracking
into a priority tree (whilst maintaining support for reordering the
tree).

To avoid the additional allocations and list manipulations, the use of
the priotree is disabled when there are no schedulers to use it.

v2: Create a dedicated slab for i915_dependency.
    Rename the lists.
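
The core of the tracking is a pair of intrusive lists meeting in each
i915_dependency; a condensed restatement of the new
__i915_priotree_add_dependency (allocation and flags omitted,
link_dependency is an illustrative name):

	/* B depends upon A: A signals, B waits. */
	static void link_dependency(struct i915_priotree *waiter,   /* B */
				    struct i915_priotree *signaler, /* A */
				    struct i915_dependency *dep)
	{
		dep->signaler = signaler;
		list_add(&dep->wait_link, &signaler->waiters_list);
		list_add(&dep->signal_link, &waiter->signalers_list);
	}

Walking signalers_list answers "what must complete before me?", and
walking waiters_list answers "who is blocked behind me?"; the scheduler
needs both directions.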

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         |  1 +
 drivers/gpu/drm/i915/i915_gem.c         | 11 +++-
 drivers/gpu/drm/i915/i915_gem_request.c | 91 ++++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_request.h | 33 ++++++++++++
 4 files changed, 134 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index c0f1dfc7119e..ab4ad5522cf5 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1778,6 +1778,7 @@ struct drm_i915_private {
 	struct kmem_cache *objects;
 	struct kmem_cache *vmas;
 	struct kmem_cache *requests;
+	struct kmem_cache *dependencies;
 
 	const struct intel_device_info info;
 
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index e1afa11609a0..b331e5966fe2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4431,12 +4431,18 @@ i915_gem_load_init(struct drm_device *dev)
 	if (!dev_priv->requests)
 		goto err_vmas;
 
+	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
+					    SLAB_HWCACHE_ALIGN |
+					    SLAB_RECLAIM_ACCOUNT);
+	if (!dev_priv->dependencies)
+		goto err_requests;
+
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	INIT_LIST_HEAD(&dev_priv->gt.timelines);
 	err = i915_gem_timeline_init__global(dev_priv);
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 	if (err)
-		goto err_requests;
+		goto err_dependencies;
 
 	INIT_LIST_HEAD(&dev_priv->context_list);
 	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
@@ -4464,6 +4470,8 @@ i915_gem_load_init(struct drm_device *dev)
 
 	return 0;
 
+err_dependencies:
+	kmem_cache_destroy(dev_priv->dependencies);
 err_requests:
 	kmem_cache_destroy(dev_priv->requests);
 err_vmas:
@@ -4480,6 +4488,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
 
 	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
 
+	kmem_cache_destroy(dev_priv->dependencies);
 	kmem_cache_destroy(dev_priv->requests);
 	kmem_cache_destroy(dev_priv->vmas);
 	kmem_cache_destroy(dev_priv->objects);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 1118cf48d6f0..78c87d94d205 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -113,6 +113,77 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
 	spin_unlock(&file_priv->mm.lock);
 }
 
+static struct i915_dependency *
+i915_dependency_alloc(struct drm_i915_private *i915)
+{
+	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
+}
+
+static void
+i915_dependency_free(struct drm_i915_private *i915,
+		     struct i915_dependency *dep)
+{
+	kmem_cache_free(i915->dependencies, dep);
+}
+
+static void
+__i915_priotree_add_dependency(struct i915_priotree *pt,
+			       struct i915_priotree *signal,
+			       struct i915_dependency *dep,
+			       unsigned long flags)
+{
+	list_add(&dep->wait_link, &signal->waiters_list);
+	list_add(&dep->signal_link, &pt->signalers_list);
+	dep->signaler = signal;
+	dep->flags = flags;
+}
+
+static int
+i915_priotree_add_dependency(struct drm_i915_private *i915,
+			     struct i915_priotree *pt,
+			     struct i915_priotree *signal)
+{
+	struct i915_dependency *dep;
+
+	dep = i915_dependency_alloc(i915);
+	if (!dep)
+		return -ENOMEM;
+
+	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
+	return 0;
+}
+
+static void
+i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
+{
+	struct i915_dependency *dep, *next;
+
+	/* Everyone we depended upon (the fences we wait to be signaled)
+	 * should retire before us and remove themselves from our list.
+	 * However, retirement is run independently on each timeline and
+	 * so we may be called out-of-order.
+	 */
+	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
+		list_del(&dep->wait_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+
+	/* Remove ourselves from everyone who depends upon us */
+	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
+		list_del(&dep->signal_link);
+		if (dep->flags & I915_DEPENDENCY_ALLOC)
+			i915_dependency_free(i915, dep);
+	}
+}
+
+static void
+i915_priotree_init(struct i915_priotree *pt)
+{
+	INIT_LIST_HEAD(&pt->signalers_list);
+	INIT_LIST_HEAD(&pt->waiters_list);
+}
+
 void i915_gem_retire_noop(struct i915_gem_active *active,
 			  struct drm_i915_gem_request *request)
 {
@@ -182,6 +253,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
 	i915_gem_context_put(request->ctx);
 
 	dma_fence_signal(&request->fence);
+
+	i915_priotree_fini(request->i915, &request->priotree);
 	i915_gem_request_put(request);
 }
 
@@ -467,6 +540,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	 */
 	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
 
+	i915_priotree_init(&req->priotree);
+
 	INIT_LIST_HEAD(&req->active_list);
 	req->i915 = dev_priv;
 	req->engine = engine;
@@ -520,6 +595,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
 
 	GEM_BUG_ON(to == from);
 
+	if (to->engine->schedule) {
+		ret = i915_priotree_add_dependency(to->i915,
+						   &to->priotree,
+						   &from->priotree);
+		if (ret < 0)
+			return ret;
+	}
+
 	if (to->timeline == from->timeline)
 		return 0;
 
@@ -743,9 +826,15 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 
 	prev = i915_gem_active_raw(&timeline->last_request,
 				   &request->i915->drm.struct_mutex);
-	if (prev)
+	if (prev) {
 		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
 					     &request->submitq);
+		if (engine->schedule)
+			__i915_priotree_add_dependency(&request->priotree,
+						       &prev->priotree,
+						       &request->dep,
+						       0);
+	}
 
 	spin_lock_irq(&timeline->lock);
 	list_add_tail(&request->link, &timeline->requests);
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 4d2784633d9f..943c39d2a62a 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -44,6 +44,28 @@ struct intel_signal_node {
 	struct intel_wait wait;
 };
 
+struct i915_dependency {
+	struct i915_priotree *signaler;
+	struct list_head signal_link;
+	struct list_head wait_link;
+	unsigned long flags;
+#define I915_DEPENDENCY_ALLOC BIT(0)
+};
+
+/* Requests exist in a complex web of interdependencies. Each request
+ * has to wait for some other request to complete before it is ready to be run
+ * (e.g. we have to wait until the pixels have been rendered into a texture
+ * before we can copy from it). We track the readiness of a request in terms
+ * of fences, but we also need to keep the dependency tree for the lifetime
+ * of the request (beyond the life of an individual fence). We use the tree
+ * at various points to reorder the requests whilst keeping the requests
+ * in order with respect to their various dependencies.
+ */
+struct i915_priotree {
+	struct list_head signalers_list; /* those before us, we depend upon */
+	struct list_head waiters_list; /* those after us, they depend upon us */
+};
+
 /**
  * Request queue structure.
  *
@@ -105,6 +127,17 @@ struct drm_i915_gem_request {
 	wait_queue_t submitq;
 	wait_queue_t execq;
 
+	/* A list of everyone we wait upon, and everyone who waits upon us.
+	 * Even though we will not be submitted to the hardware before the
+	 * submit fence is signaled (it waits for all external events as well
+	 * as our own requests), the scheduler still needs to know the
+	 * dependency tree for the lifetime of the request (from execbuf
+	 * to retirement), i.e. bidirectional dependency information for the
+	 * request not tied to individual fences.
+	 */
+	struct i915_priotree priotree;
+	struct i915_dependency dep;
+
 	u32 global_seqno;
 
 	/** GEM sequence number associated with the previous request,
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (5 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14 11:15     ` Tvrtko Ursulin
  2016-11-14  8:56   ` [PATCH v3 09/14] drm/i915: Store the execution priority on the context Chris Wilson
                     ` (8 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

Track the priority of each request and use it to determine the order in
which we submit requests to the hardware via execlists.

The priority of the request is determined by the user (eventually via
the context) but may be overridden at any time by the driver. When we set
the priority of the request, we bump the priority of all of its
dependencies to match - so that a high priority drawing operation is not
stuck behind a background task.

When the request is ready to execute (i.e. we have signaled the submit
fence following completion of all its dependencies, including third
party fences), we put the request into a priority sorted rbtree to be
submitted to the hardware. If the request is higher priority than all
pending requests, it will be submitted on the next context-switch
interrupt as soon as the hardware has completed the current request. We
do not yet preempt the request currently executing on the hardware in
order to run a very high priority request immediately.

One more limitation is that this first implementation is for execlists
only, so it is currently limited to gen8/gen9.

v2: Replace recursive priority inheritance bumping with an iterative
depth-first search list.
v3: list_next_entry() for walking lists
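
The iterative search threads a temporary dfs_link through the
dependencies instead of recursing; condensed from execlists_schedule in
the hunk below, with the engine locking and rbtree reinsertion elided:

	LIST_HEAD(dfs);
	struct i915_dependency stack = { .signaler = &request->priotree };
	struct i915_dependency *dep, *p;

	list_add(&stack.dfs_link, &dfs);
	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
		/* Moving newly found edges to the tail of the list we
		 * are walking turns the loop into a depth-first search.
		 */
		list_for_each_entry(p, &dep->signaler->signalers_list,
				    signal_link)
			if (prio > READ_ONCE(p->signaler->priority))
				list_move_tail(&p->dfs_link, &dfs);
		p = list_next_entry(dep, dfs_link); /* restore the cursor */
	}
	/* Then walk dfs in reverse, raising each priotree to prio and
	 * reinserting it into its engine's rbtree, so dependencies are
	 * requeued ahead of the request itself.
	 */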

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
 drivers/gpu/drm/i915/i915_gem.c            |   3 +-
 drivers/gpu/drm/i915/i915_gem_request.c    |   5 ++
 drivers/gpu/drm/i915/i915_gem_request.h    |   8 +-
 drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
 drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
 drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
 8 files changed, 149 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 03e3c2afbb06..1cc971cb6cb1 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
 			  struct drm_i915_gem_request *rq,
 			  const char *prefix)
 {
-	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
+	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
 		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
+		   rq->priotree.priority,
 		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
 		   rq->timeline->common->name);
 }
@@ -3216,6 +3217,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 
 		if (i915.enable_execlists) {
 			u32 ptr, read, write;
+			struct rb_node *rb;
 
 			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
 				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
@@ -3255,7 +3257,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 			rcu_read_unlock();
 
 			spin_lock_irq(&engine->timeline->lock);
-			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
+			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
+				rq = rb_entry(rb, typeof(*rq), priotree.node);
 				print_request(m, rq, "\t\tQ ");
 			}
 			spin_unlock_irq(&engine->timeline->lock);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index b331e5966fe2..a9d27f3e88d2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2722,10 +2722,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
 
 		spin_lock_irqsave(&engine->timeline->lock, flags);
 
-		INIT_LIST_HEAD(&engine->execlist_queue);
 		i915_gem_request_put(engine->execlist_port[0].request);
 		i915_gem_request_put(engine->execlist_port[1].request);
 		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		engine->execlist_queue = RB_ROOT;
+		engine->execlist_first = NULL;
 
 		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 78c87d94d205..13574a1e29b1 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -132,6 +132,7 @@ __i915_priotree_add_dependency(struct i915_priotree *pt,
 			       struct i915_dependency *dep,
 			       unsigned long flags)
 {
+	INIT_LIST_HEAD(&dep->dfs_link);
 	list_add(&dep->wait_link, &signal->waiters_list);
 	list_add(&dep->signal_link, &pt->signalers_list);
 	dep->signaler = signal;
@@ -158,6 +159,8 @@ i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
 {
 	struct i915_dependency *dep, *next;
 
+	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
+
 	/* Everyone we depended upon (the fences we wait to be signaled)
 	 * should retire before us and remove themselves from our list.
 	 * However, retirement is run independently on each timeline and
@@ -182,6 +185,8 @@ i915_priotree_init(struct i915_priotree *pt)
 {
 	INIT_LIST_HEAD(&pt->signalers_list);
 	INIT_LIST_HEAD(&pt->waiters_list);
+	RB_CLEAR_NODE(&pt->node);
+	pt->priority = INT_MIN;
 }
 
 void i915_gem_retire_noop(struct i915_gem_active *active,
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 943c39d2a62a..e2b077df2da0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -48,6 +48,7 @@ struct i915_dependency {
 	struct i915_priotree *signaler;
 	struct list_head signal_link;
 	struct list_head wait_link;
+	struct list_head dfs_link;
 	unsigned long flags;
 #define I915_DEPENDENCY_ALLOC BIT(0)
 };
@@ -64,6 +65,10 @@ struct i915_dependency {
 struct i915_priotree {
 	struct list_head signalers_list; /* those before us, we depend upon */
 	struct list_head waiters_list; /* those after us, they depend upon us */
+	struct rb_node node;
+	int priority;
+#define I915_PRIORITY_MAX 1024
+#define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
 };
 
 /**
@@ -194,9 +199,6 @@ struct drm_i915_gem_request {
 	struct drm_i915_file_private *file_priv;
 	/** file_priv list entry for this request */
 	struct list_head client_list;
-
-	/** Link in the execlist submission queue, guarded by execlist_lock. */
-	struct list_head execlist_link;
 };
 
 extern const struct dma_fence_ops i915_fence_ops;
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 942f5000d372..4462112725ef 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -1532,6 +1532,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = i915_guc_submit;
+		engine->schedule = NULL;
 
 		/* Replay the current set of previously submitted requests */
 		list_for_each_entry(request,
diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
index c9171a058478..3da4d466e332 100644
--- a/drivers/gpu/drm/i915/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/intel_engine_cs.c
@@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
  */
 void intel_engine_setup_common(struct intel_engine_cs *engine)
 {
-	INIT_LIST_HEAD(&engine->execlist_queue);
+	engine->execlist_queue = RB_ROOT;
+	engine->execlist_first = NULL;
 
 	intel_engine_init_timeline(engine);
 	intel_engine_init_hangcheck(engine);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d1aea7462515..d13a335ad83a 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
 
 static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *cursor, *last;
+	struct drm_i915_gem_request *last;
 	struct execlist_port *port = engine->execlist_port;
 	unsigned long flags;
+	struct rb_node *rb;
 	bool submit = false;
 
 	last = port->request;
@@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 	 */
 
 	spin_lock_irqsave(&engine->timeline->lock, flags);
-	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
 		/* Can we combine this request with the current port? It has to
 		 * be the same context/ringbuffer and not have any exceptions
 		 * (e.g. GVT saying never to combine contexts).
@@ -503,6 +508,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 			port++;
 		}
 
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
 		/* We keep the previous context alive until we retire the
 		 * following request. This ensures that the context object
 		 * is still pinned for any residual writes the HW makes into it
@@ -517,11 +527,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
 		submit = true;
 	}
 	if (submit) {
-		/* Decouple all the requests submitted from the queue */
-		engine->execlist_queue.next = &cursor->execlist_link;
-		cursor->execlist_link.prev = &engine->execlist_queue;
-
 		i915_gem_request_assign(&port->request, last);
+		engine->execlist_first = rb;
 	}
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 
@@ -626,6 +633,32 @@ static void intel_lrc_irq_handler(unsigned long data)
 	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
 }
 
+static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
+{
+	struct rb_node **p, *rb;
+	bool first = true;
+
+	/* most positive priority is scheduled first, equal priorities fifo */
+	rb = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct i915_priotree *pos;
+
+		rb = *p;
+		pos = rb_entry(rb, typeof(*pos), node);
+		if (pt->priority > pos->priority) {
+			p = &rb->rb_left;
+		} else {
+			p = &rb->rb_right;
+			first = false;
+		}
+	}
+	rb_link_node(&pt->node, rb, p);
+	rb_insert_color(&pt->node, root);
+
+	return first;
+}
+
 static void execlists_submit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -634,13 +667,96 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&engine->timeline->lock, flags);
 
-	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (insert_request(&request->priotree, &engine->execlist_queue))
+		engine->execlist_first = &request->priotree.node;
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
 
 	spin_unlock_irqrestore(&engine->timeline->lock, flags);
 }
 
+static struct intel_engine_cs *
+pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
+{
+	struct intel_engine_cs *engine;
+
+	engine = container_of(pt,
+			      struct drm_i915_gem_request,
+			      priotree)->engine;
+	if (engine != locked) {
+		if (locked)
+			spin_unlock_irq(&locked->timeline->lock);
+		spin_lock_irq(&engine->timeline->lock);
+	}
+
+	return engine;
+}
+
+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
+{
+	struct intel_engine_cs *engine = NULL;
+	struct i915_dependency *dep, *p;
+	struct i915_dependency stack;
+	LIST_HEAD(dfs);
+
+	if (prio <= READ_ONCE(request->priotree.priority))
+		return;
+
+	/* Need BKL in order to use the temporary link inside i915_dependency */
+	lockdep_assert_held(&request->i915->drm.struct_mutex);
+
+	stack.signaler = &request->priotree;
+	list_add(&stack.dfs_link, &dfs);
+
+	/* Recursively bump all dependent priorities to match the new request */
+	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		list_for_each_entry(p, &pt->signalers_list, signal_link)
+			if (prio > READ_ONCE(p->signaler->priority))
+				list_move_tail(&p->dfs_link, &dfs);
+
+		p = list_next_entry(dep, dfs_link);
+		if (!RB_EMPTY_NODE(&pt->node))
+			continue;
+
+		engine = pt_lock_engine(pt, engine);
+
+		/* If it is not already in the rbtree, we can update the
+		 * priority in place and skip over it (and its dependencies)
+		 * if it is referenced *again* as we descend the dfs.
+		 */
+		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
+			pt->priority = prio;
+			list_del_init(&dep->dfs_link);
+		}
+	}
+
+	/* Fifo and depth-first replacement ensure our deps execute before us */
+	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
+		struct i915_priotree *pt = dep->signaler;
+
+		INIT_LIST_HEAD(&dep->dfs_link);
+
+		engine = pt_lock_engine(pt, engine);
+
+		if (prio <= pt->priority)
+			continue;
+
+		GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));
+
+		pt->priority = prio;
+		rb_erase(&pt->node, &engine->execlist_queue);
+		if (insert_request(pt, &engine->execlist_queue))
+			engine->execlist_first = &pt->node;
+	}
+
+	if (engine)
+		spin_unlock_irq(&engine->timeline->lock);
+
+	/* XXX Do we need to preempt to make room for us and our deps? */
+}
+
 int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -1677,8 +1793,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
 
-	for_each_engine(engine, dev_priv, id)
+	for_each_engine(engine, dev_priv, id) {
 		engine->submit_request = execlists_submit_request;
+		engine->schedule = execlists_schedule;
+	}
 }
 
 static void
@@ -1691,6 +1809,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
 	engine->emit_breadcrumb = gen8_emit_breadcrumb;
 	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
 	engine->submit_request = execlists_submit_request;
+	engine->schedule = execlists_schedule;
 
 	engine->irq_enable = gen8_logical_ring_enable_irq;
 	engine->irq_disable = gen8_logical_ring_disable_irq;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index b9583941eb6b..3466b4e77e7c 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -348,7 +348,8 @@ struct intel_engine_cs {
 		struct drm_i915_gem_request *request;
 		unsigned int count;
 	} execlist_port[2];
-	struct list_head execlist_queue;
+	struct rb_root execlist_queue;
+	struct rb_node *execlist_first;
 	unsigned int fw_domains;
 	bool disable_lite_restore_wa;
 	bool preempt_wa;
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 09/14] drm/i915: Store the execution priority on the context
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (6 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14 11:16     ` Tvrtko Ursulin
  2016-11-14  8:56   ` [PATCH v3 10/14] drm/i915/scheduler: Boost priorities for flips Chris Wilson
                     ` (7 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

In order to support userspace defining different levels of importance to
different contexts, and in particular the preferred order of execution,
store a priority value on each context. By default, the kernel's
context, which is used for idling and other background tasks, is given
minimum priority (all user contexts will execute first).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h         | 1 +
 drivers/gpu/drm/i915/i915_gem_context.c | 1 +
 drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index ab4ad5522cf5..fb3e850f5d3a 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -936,6 +936,7 @@ struct i915_gem_context {
 	/* Unique identifier for this context, used by the hw for tracking */
 	unsigned int hw_id;
 	u32 user_handle;
+	int priority; /* greater priorities are serviced first */
 
 	u32 ggtt_alignment;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 6dd475735f0a..1f94b8d6d83d 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
 		return PTR_ERR(ctx);
 	}
 
+	ctx->priority = I915_PRIORITY_MIN; /* lowest priority; idle task */
 	dev_priv->kernel_context = ctx;
 
 	DRM_DEBUG_DRIVER("%s context support initialized\n",
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index 13574a1e29b1..b9b5253cf3cd 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -867,7 +867,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * run at the earliest possible convenience.
 	 */
 	if (engine->schedule)
-		engine->schedule(request, 0);
+		engine->schedule(request, request->ctx->priority);
 
 	local_bh_disable();
 	i915_sw_fence_commit(&request->submit);
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 10/14] drm/i915/scheduler: Boost priorities for flips
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (7 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 09/14] drm/i915: Store the execution priority on the context Chris Wilson
@ 2016-11-14  8:56   ` Chris Wilson
  2016-11-14  8:57   ` [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
                     ` (6 subsequent siblings)
  15 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:56 UTC (permalink / raw)
  To: intel-gfx

Boost the priority of any rendering required to show the next pageflip,
as we want to avoid missing the vblank because we were delayed by an
invisible workload. We prioritise avoiding jank and jitter in the GUI
even at the cost of starving background tasks.

v2: Descend dma_fence_array when boosting priorities.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
---
 drivers/gpu/drm/i915/i915_drv.h      |  5 +++
 drivers/gpu/drm/i915/i915_gem.c      | 65 ++++++++++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/intel_display.c |  2 ++
 3 files changed, 72 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index fb3e850f5d3a..eff93ffde25c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -3093,6 +3093,11 @@ int i915_gem_object_wait(struct drm_i915_gem_object *obj,
 			 unsigned int flags,
 			 long timeout,
 			 struct intel_rps_client *rps);
+int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+				  unsigned int flags,
+				  int priority);
+#define I915_PRIORITY_DISPLAY I915_PRIORITY_MAX
+
 int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 				  bool write);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a9d27f3e88d2..8ccc80c679f5 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -34,6 +34,7 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 #include "intel_mocs.h"
+#include <linux/dma-fence-array.h>
 #include <linux/reservation.h>
 #include <linux/shmem_fs.h>
 #include <linux/slab.h>
@@ -435,6 +436,70 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
 	return timeout;
 }
 
+static void __fence_set_priority(struct dma_fence *fence, int prio)
+{
+	struct drm_i915_gem_request *rq;
+	struct intel_engine_cs *engine;
+
+	if (!dma_fence_is_i915(fence))
+		return;
+
+	rq = to_request(fence);
+	engine = rq->engine;
+	if (!engine->schedule)
+		return;
+
+	engine->schedule(rq, prio);
+}
+
+static void fence_set_priority(struct dma_fence *fence, int prio)
+{
+	/* Recurse once into a fence-array */
+	if (dma_fence_is_array(fence)) {
+		struct dma_fence_array *array = to_dma_fence_array(fence);
+		int i;
+
+		for (i = 0; i < array->num_fences; i++)
+			__fence_set_priority(array->fences[i], prio);
+	} else {
+		__fence_set_priority(fence, prio);
+	}
+}
+
+int
+i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
+			      unsigned int flags,
+			      int prio)
+{
+	struct dma_fence *excl;
+
+	if (flags & I915_WAIT_ALL) {
+		struct dma_fence **shared;
+		unsigned int count, i;
+		int ret;
+
+		ret = reservation_object_get_fences_rcu(obj->resv,
+							&excl, &count, &shared);
+		if (ret)
+			return ret;
+
+		for (i = 0; i < count; i++) {
+			fence_set_priority(shared[i], prio);
+			dma_fence_put(shared[i]);
+		}
+
+		kfree(shared);
+	} else {
+		excl = reservation_object_get_excl_rcu(obj->resv);
+	}
+
+	if (excl) {
+		fence_set_priority(excl, prio);
+		dma_fence_put(excl);
+	}
+	return 0;
+}
+
 /**
  * Waits for rendering to the object to be completed
  * @obj: i915 gem object
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 916a3c8eefd1..7c45e8973887 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -14765,6 +14765,8 @@ intel_prepare_plane_fb(struct drm_plane *plane,
 						      GFP_KERNEL);
 		if (ret < 0)
 			return ret;
+
+		i915_gem_object_wait_priority(obj, 0, I915_PRIORITY_DISPLAY);
 	}
 
 	if (plane->type == DRM_PLANE_TYPE_CURSOR &&
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (8 preceding siblings ...)
  2016-11-14  8:56   ` [PATCH v3 10/14] drm/i915/scheduler: Boost priorities for flips Chris Wilson
@ 2016-11-14  8:57   ` Chris Wilson
  2016-11-14 11:31     ` Tvrtko Ursulin
  2016-12-01 10:45     ` Tvrtko Ursulin
  2016-11-14  8:57   ` [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities Chris Wilson
                     ` (5 subsequent siblings)
  15 siblings, 2 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:57 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).
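
The wakeup cost is structural: only two requests are exposed to the GuC
at any time, so the steady state of the new tasklet (sketched below,
simplified from i915_guc_irq_handler in this patch) is retire one,
submit one:

	struct execlist_port *port = engine->execlist_port;
	bool submit;

	do {
		/* Retire completed requests out of the two-slot queue. */
		while (port[0].request &&
		       i915_gem_request_completed(port[0].request)) {
			i915_gem_request_put(port[0].request);
			port[0].request = port[1].request;
			port[1].request = NULL;
		}

		/* Refill from the priority rbtree when a slot is free. */
		submit = !port[1].request && i915_guc_dequeue(engine);
	} while (submit);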
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
 drivers/gpu/drm/i915/i915_irq.c            |  4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |  3 --
 3 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 4462112725ef..088f5a99ecfc 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
 	freespace -= gc->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		gc->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 
 	return ret;
 }
@@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
 
-	spin_lock(&gc->wq_lock);
+	spin_lock_irq(&gc->wq_lock);
 	gc->wq_rsvd -= wqi_size;
-	spin_unlock(&gc->wq_lock);
+	spin_unlock_irq(&gc->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	rq->previous_context = engine->last_context;
 	engine->last_context = rq->ctx;
 
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 
 	spin_lock(&client->wq_lock);
 	guc_wq_item_append(client, rq);
@@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
 	spin_unlock(&client->wq_lock);
 }
 
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			dma_fence_enable_sw_signaling(&last->fence);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		dma_fence_enable_sw_signaling(&last->fence);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			rq = port[1].request;
+			port[0].request = rq;
+			port[1].request = NULL;
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
-		list_for_each_entry(request,
-				    &engine->timeline->requests, link) {
+		list_for_each_entry(request, &engine->timeline->requests, link)
 			client->wq_rsvd += sizeof(struct guc_wq_item);
-			if (i915_sw_fence_done(&request->submit))
-				i915_guc_submit(request);
-		}
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index cb8a75f6ca16..18dce4c66d56 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		tasklet_schedule(&engine->irq_tasklet);
 		notify_ring(engine);
+	}
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		tasklet_schedule(&engine->irq_tasklet);
 }
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index d13a335ad83a..ffab255e55a7 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
 	if (request->ctx != port[0].request->ctx) {
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (9 preceding siblings ...)
  2016-11-14  8:57   ` [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2016-11-14  8:57   ` Chris Wilson
  2016-11-14 11:32     ` Tvrtko Ursulin
  2016-11-14  8:57   ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
                     ` (4 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:57 UTC (permalink / raw)
  To: intel-gfx

Use a priority stored in the context as the initial value when
submitting a request. This allows us to change the default priority on a
per-context basis, allowing different contexts to be favoured with GPU
time at the expense of lower importance work. The user can adjust the
context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
values being higher priority (they will be serviced earlier, after their
dependencies have been resolved). Any prerequisite work for an execbuf
will have its priority raised to match the new request as required.

Normal users can specify any value in the range of -1023 to 0 [default],
i.e. they can reduce the priority of their workloads (and temporarily
boost it back to normal if so desired).

Privileged users can specify any value in the range of -1023 to 1023
[default is 0], i.e. they can raise their priority above all others and
so potentially starve the system.

Note that the existing schedulers are neither fair nor load balancing;
execution is strictly by priority on a first-come, first-served basis,
and the driver may choose to boost some requests above the range
available to users.

This priority was originally based around nice(2), but evolved to allow
clients to adjust their priority within a small range, and allow for a
privileged high priority range.

For example, this can be used to implement EGL_IMG_context_priority
https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt

	EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
        the context to be created. This attribute is a hint, as an
        implementation may not support multiple contexts at some
        priority levels and system policy may limit access to high
        priority contexts to appropriate system privilege level. The
        default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
        EGL_CONTEXT_PRIORITY_MEDIUM_IMG."

so we can map

	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
	PRIORITY_MED -> 0 [default]
	PRIORITY_LOW -> -1023

They also map onto the priorities used by VkQueue (and a VkQueue is
essentially a timeline, our i915_gem_context under full-ppgtt).
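
As an aside, a minimal userspace sketch of driving this uAPI (illustrative
only, not part of this patch; the helper name is ours and error handling
is elided) could look like:

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative helper: drop one context to background priority.
 * Values in -1023..0 require no privilege; raising above 0 needs
 * CAP_SYS_ADMIN.
 */
static int context_set_priority(int fd, uint32_t ctx_id, int prio)
{
	struct drm_i915_gem_context_param p;

	memset(&p, 0, sizeof(p));
	p.ctx_id = ctx_id;
	p.param = I915_CONTEXT_PARAM_PRIORITY;
	p.value = prio;

	return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
}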

Testcase: igt/gem_exec_schedule
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_context.c | 20 ++++++++++++++++++++
 include/uapi/drm/i915_drm.h             |  3 +++
 2 files changed, 23 insertions(+)

diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
index 1f94b8d6d83d..1f74ab266f6b 100644
--- a/drivers/gpu/drm/i915/i915_gem_context.c
+++ b/drivers/gpu/drm/i915/i915_gem_context.c
@@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
 	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
 		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
 		break;
+	case I915_CONTEXT_PARAM_PRIORITY:
+		args->value = ctx->priority;
+		break;
 	default:
 		ret = -EINVAL;
 		break;
@@ -1156,6 +1159,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
 				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
 		}
 		break;
+
+	case I915_CONTEXT_PARAM_PRIORITY:
+		{
+			int priority = args->value;
+
+			if (args->size)
+				ret = -EINVAL;
+			else if (priority >= I915_PRIORITY_MAX ||
+				 priority <= -I915_PRIORITY_MAX)
+				ret = -EINVAL;
+			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
+				ret = -EPERM;
+			else
+				ctx->priority = priority;
+		}
+		break;
+
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 1c12a350eca3..47901a8ad682 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -391,6 +391,8 @@ typedef struct drm_i915_irq_wait {
 
 /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
  * priorities and the driver will attempt to execute batches in priority order.
+ * The initial priority for each batch is supplied by the context and is
+ * controlled via I915_CONTEXT_PARAM_PRIORITY.
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
@@ -1224,6 +1226,7 @@ struct drm_i915_gem_context_param {
 #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
 #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
 #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
+#define I915_CONTEXT_PARAM_PRIORITY	0x5
 	__u64 value;
 };
 
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (10 preceding siblings ...)
  2016-11-14  8:57   ` [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2016-11-14  8:57   ` Chris Wilson
  2017-01-25 20:38     ` Chad Versace
  2016-11-14  8:57   ` [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf Chris Wilson
                     ` (3 subsequent siblings)
  15 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:57 UTC (permalink / raw)
  To: intel-gfx

Userspace is faced with a dilemma. The kernel requires implicit fencing
to manage resource usage (we always must wait for the GPU to finish
before releasing its PTE) and for third parties. However, userspace may
wish to avoid this serialisation if it is using explicit fencing
between parties and wants more fine-grained access to buffers (e.g. it
may partition the buffer between uses and track fences on ranges rather
than the implicit fences tracking the whole object). It follows that
userspace needs a mechanism to avoid the kernel's serialisation on its
implicit fences before execbuf execution.

The next question is whether this is an object, execbuf or context flag.
Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag needs to be set
only once for the lifetime of the object, this reduces the convenience of
having an execbuf or context level flag (and avoids having multiple
pieces of uABI controlling the same feature).

Incorrect use of this flag will result in rendering corruption and GPU
hangs - but will not result in use-after-free or similar resource
tracking issues.

Serious caveat: write ordering is not strictly correct after setting
this flag on a render target on multiple engines. This affects all
subsequent GEM operations (execbuf, set-domain, pread) and shared
dma-buf operations. A fix is possible - but costly (both in terms of
further ABI changes and runtime overhead).
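
To illustrate the intended usage (a sketch only, not part of this patch;
the helper names are ours), userspace first confirms the parameter and
then sets the flag on just those objects it tracks explicitly:

#include <stdbool.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative helper: query whether the kernel honours
 * EXEC_OBJECT_ASYNC before relying on it.
 */
static bool has_exec_async(int fd)
{
	drm_i915_getparam_t gp;
	int value = 0;

	memset(&gp, 0, sizeof(gp));
	gp.param = I915_PARAM_HAS_EXEC_ASYNC;
	gp.value = &value;

	return drmIoctl(fd, DRM_IOCTL_I915_GETPARAM, &gp) == 0 && value;
}

/* Opt a single object out of implicit fencing; all other objects in
 * the same execbuf keep the kernel's serialisation.
 */
static void exec_object_set_async(struct drm_i915_gem_exec_object2 *obj)
{
	obj->flags |= EXEC_OBJECT_ASYNC;
}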

Testcase: igt/gem_exec_async
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_drv.c            |  1 +
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
 include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
 3 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 99fc075100b1..657c465a8d50 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -343,6 +343,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_EXEC_HANDLE_LUT:
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
+	case I915_PARAM_HAS_EXEC_ASYNC:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index e804cb2fa57e..781b5559f86e 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -1111,6 +1111,9 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 	list_for_each_entry(vma, vmas, exec_list) {
 		struct drm_i915_gem_object *obj = vma->obj;
 
+		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
+			continue;
+
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
 		if (ret)
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 47901a8ad682..4bd83c0b07db 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -396,6 +396,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_SCHEDULER	 41
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports the ability to opt-out of
+ * synchronisation with implicit fencing on individual objects.
+ */
+#define I915_PARAM_HAS_EXEC_ASYNC	 42
+
+
 typedef struct drm_i915_getparam {
 	__s32 param;
 	/*
@@ -736,8 +742,29 @@ struct drm_i915_gem_exec_object2 {
 #define EXEC_OBJECT_SUPPORTS_48B_ADDRESS (1<<3)
 #define EXEC_OBJECT_PINNED		 (1<<4)
 #define EXEC_OBJECT_PAD_TO_SIZE		 (1<<5)
+/* The kernel implicitly tracks GPU activity on all GEM objects, and
+ * synchronises operations with outstanding rendering. This includes
+ * rendering on other devices if exported via dma-buf. However, sometimes
+ * this tracking is too coarse and the user knows better. For example,
+ * if the object is split into non-overlapping ranges shared between different
+ * clients or engines (i.e. suballocating objects), the implicit tracking
+ * by the kernel assumes that each operation affects the whole object rather
+ * than an individual range, causing needless synchronisation between clients.
+ * The kernel will also forgo any CPU cache flushes prior to rendering from
+ * the object as the client is expected to be also handling such domain
+ * tracking.
+ *
+ * The kernel maintains the implicit tracking in order to manage resources
+ * used by the GPU - this flag only disables the synchronisation prior to
+ * rendering with this object in this execbuf.
+ *
+ * Opting out of implicit synchronisation requires the user to do its own
+ * explicit tracking to avoid rendering corruption. See, for example,
+ * I915_PARAM_HAS_EXEC_FENCE to order execbufs and execute them asynchronously.
+ */
+#define EXEC_OBJECT_ASYNC		(1<<6)
 /* All remaining bits are MBZ and RESERVED FOR FUTURE USE */
-#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_PAD_TO_SIZE<<1)
+#define __EXEC_OBJECT_UNKNOWN_FLAGS -(EXEC_OBJECT_ASYNC<<1)
 	__u64 flags;
 
 	union {
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (11 preceding siblings ...)
  2016-11-14  8:57   ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2016-11-14  8:57   ` Chris Wilson
  2016-11-14 22:29     ` Rafael Antognolli
  2017-01-25 20:27     ` Chad Versace
  2016-11-14  9:01   ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Tvrtko Ursulin
                     ` (2 subsequent siblings)
  15 siblings, 2 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  8:57 UTC (permalink / raw)
  To: intel-gfx

Now that the user can opt-out of implicit fencing, we need to give them
back control over the fencing. We employ sync_file to wrap our
drm_i915_gem_request and provide an fd that userspace can merge with
other sync_file fds and pass back to the kernel to wait upon before
future execution.
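
For illustration (a sketch against this series' uAPI; the wrapper is ours
and error handling is minimal), an execbuf that both waits on an in-fence
and returns an out-fence:

#include <stdint.h>
#include <xf86drm.h>
#include <drm/i915_drm.h>

/* Illustrative wrapper: note the _WR ioctl so that rsvd2 is copied
 * back to userspace with the out-fence fd in its upper 32 bits.
 */
static int execbuf_fenced(int fd, struct drm_i915_gem_execbuffer2 *eb,
			  int in_fd, int *out_fd)
{
	int err;

	eb->flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
	eb->rsvd2 = (uint32_t)in_fd; /* in-fence fd in the low 32 bits */

	err = drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, eb);
	if (err)
		return err;

	*out_fd = eb->rsvd2 >> 32; /* out-fence fd, now owned by the caller */
	return 0;
}

The returned fd can be merged with other sync_files or fed back in as a
future in-fence.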

Testcase: igt/gem_exec_fence
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
---
 drivers/gpu/drm/i915/Kconfig               |  1 +
 drivers/gpu/drm/i915/i915_drv.c            |  3 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
 include/uapi/drm/i915_drm.h                | 35 ++++++++++++++++++-
 4 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
index beed5c1d2cd7..bed81fe1d2a7 100644
--- a/drivers/gpu/drm/i915/Kconfig
+++ b/drivers/gpu/drm/i915/Kconfig
@@ -19,6 +19,7 @@ config DRM_I915
 	select INPUT if ACPI
 	select ACPI_VIDEO if ACPI
 	select ACPI_BUTTON if ACPI
+	select SYNC_FILE
 	help
 	  Choose this option if you have a system that has "Intel Graphics
 	  Media Accelerator" or "HD Graphics" integrated graphics,
diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
index 657c465a8d50..6fe7f41a5b5b 100644
--- a/drivers/gpu/drm/i915/i915_drv.c
+++ b/drivers/gpu/drm/i915/i915_drv.c
@@ -344,6 +344,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
 	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
 	case I915_PARAM_HAS_EXEC_SOFTPIN:
 	case I915_PARAM_HAS_EXEC_ASYNC:
+	case I915_PARAM_HAS_EXEC_FENCE:
 		/* For the time being all of these are always true;
 		 * if some supported hardware does not have one of these
 		 * features this value needs to be provided from
@@ -2533,7 +2534,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
-	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
+	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
 	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
 	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 781b5559f86e..facec610b55a 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -28,6 +28,7 @@
 
 #include <linux/dma_remapping.h>
 #include <linux/reservation.h>
+#include <linux/sync_file.h>
 #include <linux/uaccess.h>
 
 #include <drm/drmP.h>
@@ -1597,6 +1598,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	struct i915_execbuffer_params *params = &params_master;
 	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
 	u32 dispatch_flags;
+	struct dma_fence *in_fence = NULL;
+	struct sync_file *out_fence = NULL;
+	int out_fence_fd = -1;
 	int ret;
 	bool need_relocs;
 
@@ -1640,6 +1644,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		dispatch_flags |= I915_DISPATCH_RS;
 	}
 
+	if (args->flags & I915_EXEC_FENCE_IN) {
+		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
+		if (!in_fence) {
+			ret = -EINVAL;
+			goto pre_mutex_err;
+		}
+	}
+
+	if (args->flags & I915_EXEC_FENCE_OUT) {
+		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
+		if (out_fence_fd < 0) {
+			ret = out_fence_fd;
+			out_fence_fd = -1;
+			goto pre_mutex_err;
+		}
+	}
+
 	/* Take a local wakeref for preparing to dispatch the execbuf as
 	 * we expect to access the hardware fairly frequently in the
 	 * process. Upon first dispatch, we acquire another prolonged
@@ -1784,6 +1805,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 		goto err_batch_unpin;
 	}
 
+	if (in_fence) {
+		ret = i915_gem_request_await_dma_fence(params->request,
+						       in_fence);
+		if (ret < 0)
+			goto err_request;
+	}
+
+	if (out_fence_fd != -1) {
+		out_fence = sync_file_create(&params->request->fence);
+		if (!out_fence) {
+			ret = -ENOMEM;
+			goto err_request;
+		}
+	}
+
 	/* Whilst this request exists, batch_obj will be on the
 	 * active_list, and so will hold the active reference. Only when this
 	 * request is retired will the batch_obj be moved onto the
@@ -1811,6 +1847,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	ret = execbuf_submit(params, args, &eb->vmas);
 err_request:
 	__i915_add_request(params->request, ret == 0);
+	if (out_fence) {
+		if (ret == 0) {
+			fd_install(out_fence_fd, out_fence->file);
+			args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
+			args->rsvd2 |= (u64)out_fence_fd << 32;
+			out_fence_fd = -1;
+		} else {
+			fput(out_fence->file);
+		}
+	}
 
 err_batch_unpin:
 	/*
@@ -1832,6 +1878,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
 	/* intel_gpu_busy should also get a ref, so it will free when the device
 	 * is really idle. */
 	intel_runtime_pm_put(dev_priv);
+	if (out_fence_fd != -1)
+		put_unused_fd(out_fence_fd);
+	dma_fence_put(in_fence);
 	return ret;
 }
 
@@ -1939,11 +1988,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
 		return -EINVAL;
 	}
 
-	if (args->rsvd2 != 0) {
-		DRM_DEBUG("dirty rvsd2 field\n");
-		return -EINVAL;
-	}
-
 	exec2_list = drm_malloc_gfp(args->buffer_count,
 				    sizeof(*exec2_list),
 				    GFP_TEMPORARY);
diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
index 4bd83c0b07db..90082269fb50 100644
--- a/include/uapi/drm/i915_drm.h
+++ b/include/uapi/drm/i915_drm.h
@@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
 #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
 #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
 #define DRM_I915_GEM_WAIT	0x2c
@@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
 #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
+#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
 #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
 #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
 #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
@@ -401,6 +403,12 @@ typedef struct drm_i915_irq_wait {
  */
 #define I915_PARAM_HAS_EXEC_ASYNC	 42
 
+/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
+ * both being able to pass in a sync_file fd to wait upon before executing,
+ * and being able to return a new sync_file fd that is signaled when the
+ * current request is complete.
+ */
+#define I915_PARAM_HAS_EXEC_FENCE	 43
 
 typedef struct drm_i915_getparam {
 	__s32 param;
@@ -854,7 +862,32 @@ struct drm_i915_gem_execbuffer2 {
  */
 #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
 
-#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
+/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
+ * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
+ * the batch.
+ *
+ * Returns -EINVAL if the sync_file fd cannot be found.
+ */
+#define I915_EXEC_FENCE_IN		(1<<16)
+
+/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
+ * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
+ * to the caller, and it should be closed after use. (The fd is a regular
+ * file descriptor and will be cleaned up on process termination. It holds
+ * a reference to the request, but nothing else.)
+ *
+ * The sync_file fd can be combined with other sync_file fds and passed either
+ * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
+ * will only occur after this request completes), or to other devices.
+ *
+ * Using I915_EXEC_FENCE_OUT requires use of
+ * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
+ * back to userspace. Failure to do so will cause the out-fence to always
+ * be reported as zero, and the real fence fd to be leaked.
+ */
+#define I915_EXEC_FENCE_OUT		(1<<17)
+
+#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
 
 #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
 #define i915_execbuffer2_set_context_id(eb2, context) \
-- 
2.10.2

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (12 preceding siblings ...)
  2016-11-14  8:57   ` [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-11-14  9:01   ` Tvrtko Ursulin
  2016-11-14  9:05     ` Chris Wilson
  2016-11-14 10:57   ` Tvrtko Ursulin
  2016-11-14 14:48   ` Joonas Lahtinen
  15 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14  9:01 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> Localise the static struct lock_class_key to the caller of
> i915_sw_fence_init() so that we create a lock_class instance for each
> unique sw_fence rather than all sw_fences sharing the same
> lock_class. This eliminates some lockdep false positives when using fences
> from within fence callbacks.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

This one had Joonas' r-b.

Regards,

Tvrtko

> ---
>  drivers/gpu/drm/i915/i915_sw_fence.c |  7 +++++--
>  drivers/gpu/drm/i915/i915_sw_fence.h | 11 ++++++++++-
>  2 files changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 95f2f12e0917..147420ccf49c 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -116,11 +116,14 @@ static void i915_sw_fence_await(struct i915_sw_fence *fence)
>  	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
>  }
>
> -void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
> +void __i915_sw_fence_init(struct i915_sw_fence *fence,
> +			  i915_sw_fence_notify_t fn,
> +			  const char *name,
> +			  struct lock_class_key *key)
>  {
>  	BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
>
> -	init_waitqueue_head(&fence->wait);
> +	__init_waitqueue_head(&fence->wait, name, key);
>  	kref_init(&fence->kref);
>  	atomic_set(&fence->pending, 1);
>  	fence->flags = (unsigned long)fn;
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 707dfc4f0da5..a5546eb2b5cd 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -40,7 +40,16 @@ typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
>  				      enum i915_sw_fence_notify state);
>  #define __i915_sw_fence_call __aligned(4)
>
> -void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> +void __i915_sw_fence_init(struct i915_sw_fence *fence,
> +			  i915_sw_fence_notify_t fn,
> +			  const char *name,
> +			  struct lock_class_key *key);
> +#define i915_sw_fence_init(fence, fn) do {			\
> +	static struct lock_class_key __key; 			\
> +								\
> +	__i915_sw_fence_init((fence), fn, #fence, &__key);	\
> +} while (0)
> +
>  void i915_sw_fence_commit(struct i915_sw_fence *fence);
>
>  int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
>
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-14  9:01   ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Tvrtko Ursulin
@ 2016-11-14  9:05     ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14  9:05 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 14, 2016 at 09:01:00AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 08:56, Chris Wilson wrote:
> >Localise the static struct lock_class_key to the caller of
> >i915_sw_fence_init() so that we create a lock_class instance for each
> >unique sw_fence rather than all sw_fences sharing the same
> >lock_class. This eliminates some lockdep false positives when using fences
> >from within fence callbacks.
> >
> >Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> This one had Joonas' r-b.

No, this is a new patch. Tackling the same issue of nested annotations
but this time adding the classes to the fence not to the timeline (which
is the next patch).

This is the difference between v2 and v3, and kills that remaining nasty
spin_lock_irqsave_nested() from v2.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (13 preceding siblings ...)
  2016-11-14  9:01   ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Tvrtko Ursulin
@ 2016-11-14 10:57   ` Tvrtko Ursulin
  2016-11-14 14:48   ` Joonas Lahtinen
  15 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 10:57 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> Localise the static struct lock_class_key to the caller of
> i915_sw_fence_init() so that we create a lock_class instance for each
> unique sw_fence rather than all sw_fences sharing the same
> lock_class. This eliminates some lockdep false positives when using fences
> from within fence callbacks.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_sw_fence.c |  7 +++++--
>  drivers/gpu/drm/i915/i915_sw_fence.h | 11 ++++++++++-
>  2 files changed, 15 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
> index 95f2f12e0917..147420ccf49c 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.c
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.c
> @@ -116,11 +116,14 @@ static void i915_sw_fence_await(struct i915_sw_fence *fence)
>  	WARN_ON(atomic_inc_return(&fence->pending) <= 1);
>  }
>
> -void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn)
> +void __i915_sw_fence_init(struct i915_sw_fence *fence,
> +			  i915_sw_fence_notify_t fn,
> +			  const char *name,
> +			  struct lock_class_key *key)
>  {
>  	BUG_ON((unsigned long)fn & ~I915_SW_FENCE_MASK);
>
> -	init_waitqueue_head(&fence->wait);
> +	__init_waitqueue_head(&fence->wait, name, key);
>  	kref_init(&fence->kref);
>  	atomic_set(&fence->pending, 1);
>  	fence->flags = (unsigned long)fn;
> diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
> index 707dfc4f0da5..a5546eb2b5cd 100644
> --- a/drivers/gpu/drm/i915/i915_sw_fence.h
> +++ b/drivers/gpu/drm/i915/i915_sw_fence.h
> @@ -40,7 +40,16 @@ typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
>  				      enum i915_sw_fence_notify state);
>  #define __i915_sw_fence_call __aligned(4)
>
> -void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> +void __i915_sw_fence_init(struct i915_sw_fence *fence,
> +			  i915_sw_fence_notify_t fn,
> +			  const char *name,
> +			  struct lock_class_key *key);
> +#define i915_sw_fence_init(fence, fn) do {			\
> +	static struct lock_class_key __key; 			\
> +								\
> +	__i915_sw_fence_init((fence), fn, #fence, &__key);	\

(fn) ?

> +} while (0)
> +
>  void i915_sw_fence_commit(struct i915_sw_fence *fence);
>
>  int i915_sw_fence_await_sw_fence(struct i915_sw_fence *fence,
>

Looks good to me.

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission
  2016-11-14  8:56   ` [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
@ 2016-11-14 10:59     ` Tvrtko Ursulin
  0 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 10:59 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> Defer the transfer from the client's timeline onto the execution
> timeline from the point of readiness to the point of actual submission.
> For example, in execlists, a request is finally submitted to hardware
> when the hardware is ready, and only put onto the hardware queue when
> the request is ready. By deferring the transfer, we ensure that the
> timeline is maintained in retirement order if we decide to queue the
> requests onto the hardware in a different order than fifo.
>
> v2: Rebased onto distinct global/user timeline lock classes.
> v3: Play with the position of the spin_lock().
> v4: Nesting finally resolved with distinct sw_fence lock classes.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_request.c    | 38 ++++++++++++++++++++----------
>  drivers/gpu/drm/i915/i915_gem_request.h    |  3 +++
>  drivers/gpu/drm/i915/i915_guc_submission.c | 14 ++++++++++-
>  drivers/gpu/drm/i915/intel_lrc.c           | 23 +++++++++++-------
>  drivers/gpu/drm/i915/intel_ringbuffer.c    |  2 ++
>  5 files changed, 57 insertions(+), 23 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index d0f6b9f82636..952d2aec5244 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -306,25 +306,16 @@ static u32 timeline_get_seqno(struct i915_gem_timeline *tl)
>  	return atomic_inc_return(&tl->next_seqno);
>  }
>
> -static int __i915_sw_fence_call
> -submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request)
>  {
> -	struct drm_i915_gem_request *request =
> -		container_of(fence, typeof(*request), submit);
>  	struct intel_engine_cs *engine = request->engine;
>  	struct intel_timeline *timeline;
> -	unsigned long flags;
>  	u32 seqno;
>
> -	if (state != FENCE_COMPLETE)
> -		return NOTIFY_DONE;
> -
>  	/* Transfer from per-context onto the global per-engine timeline */
>  	timeline = engine->timeline;
>  	GEM_BUG_ON(timeline == request->timeline);
> -
> -	/* Will be called from irq-context when using foreign DMA fences */
> -	spin_lock_irqsave(&timeline->lock, flags);
> +	assert_spin_locked(&timeline->lock);
>
>  	seqno = timeline_get_seqno(timeline->common);
>  	GEM_BUG_ON(!seqno);
> @@ -344,15 +335,36 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
>  	GEM_BUG_ON(!request->global_seqno);
>  	engine->emit_breadcrumb(request,
>  				request->ring->vaddr + request->postfix);
> -	engine->submit_request(request);
>
>  	spin_lock(&request->timeline->lock);
>  	list_move_tail(&request->link, &timeline->requests);
>  	spin_unlock(&request->timeline->lock);
>
>  	i915_sw_fence_commit(&request->execute);
> +}
> +
> +void i915_gem_request_submit(struct drm_i915_gem_request *request)
> +{
> +	struct intel_engine_cs *engine = request->engine;
> +	unsigned long flags;
>
> -	spin_unlock_irqrestore(&timeline->lock, flags);
> +	/* Will be called from irq-context when using foreign fences. */
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +
> +	__i915_gem_request_submit(request);
> +
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +}
> +
> +static int __i915_sw_fence_call
> +submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
> +{
> +	if (state == FENCE_COMPLETE) {
> +		struct drm_i915_gem_request *request =
> +			container_of(fence, typeof(*request), submit);
> +
> +		request->engine->submit_request(request);
> +	}
>
>  	return NOTIFY_DONE;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 4976039189ea..4d2784633d9f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -232,6 +232,9 @@ void __i915_add_request(struct drm_i915_gem_request *req, bool flush_caches);
>  #define i915_add_request_no_flush(req) \
>  	__i915_add_request(req, false)
>
> +void __i915_gem_request_submit(struct drm_i915_gem_request *request);
> +void i915_gem_request_submit(struct drm_i915_gem_request *request);
> +
>  struct intel_rps_client;
>  #define NO_WAITBOOST ERR_PTR(-1)
>  #define IS_RPS_CLIENT(p) (!IS_ERR(p))
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 666dab7a675a..942f5000d372 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -629,11 +629,23 @@ static int guc_ring_doorbell(struct i915_guc_client *gc)
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
>  	struct drm_i915_private *dev_priv = rq->i915;
> -	unsigned int engine_id = rq->engine->id;
> +	struct intel_engine_cs *engine = rq->engine;
> +	unsigned int engine_id = engine->id;
>  	struct intel_guc *guc = &rq->i915->guc;
>  	struct i915_guc_client *client = guc->execbuf_client;
>  	int b_ret;
>
> +	/* We keep the previous context alive until we retire the following
> +	 * request. This ensures that the context object is still pinned
> +	 * for any residual writes the HW makes into it on the context switch
> +	 * into the next object following the breadcrumb. Otherwise, we may
> +	 * retire the context too early.
> +	 */
> +	rq->previous_context = engine->last_context;
> +	engine->last_context = rq->ctx;
> +
> +	i915_gem_request_submit(rq);
> +
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
>
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index dde04b7643b1..dca41834dec1 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -434,6 +434,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
>  	struct drm_i915_gem_request *cursor, *last;
>  	struct execlist_port *port = engine->execlist_port;
> +	unsigned long flags;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -469,6 +470,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 * and context switches) submission.
>  	 */
>
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>  	spin_lock(&engine->execlist_lock);
>  	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
>  		/* Can we combine this request with the current port? It has to
> @@ -501,6 +503,17 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			i915_gem_request_assign(&port->request, last);
>  			port++;
>  		}
> +
> +		/* We keep the previous context alive until we retire the
> +		 * following request. This ensures that the context object
> +		 * is still pinned for any residual writes the HW makes into it
> +		 * on the context switch into the next object following the
> +		 * breadcrumb. Otherwise, we may retire the context too early.
> +		 */
> +		cursor->previous_context = engine->last_context;
> +		engine->last_context = cursor->ctx;
> +
> +		__i915_gem_request_submit(cursor);
>  		last = cursor;
>  		submit = true;
>  	}
> @@ -512,6 +525,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		i915_gem_request_assign(&port->request, last);
>  	}
>  	spin_unlock(&engine->execlist_lock);
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
>  	if (submit)
>  		execlists_submit_ports(engine);
> @@ -621,15 +635,6 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>
>  	spin_lock_irqsave(&engine->execlist_lock, flags);
>
> -	/* We keep the previous context alive until we retire the following
> -	 * request. This ensures that any the context object is still pinned
> -	 * for any residual writes the HW makes into it on the context switch
> -	 * into the next object following the breadcrumb. Otherwise, we may
> -	 * retire the context too early.
> -	 */
> -	request->previous_context = engine->last_context;
> -	engine->last_context = request->ctx;
> -
>  	list_add_tail(&request->execlist_link, &engine->execlist_queue);
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 700e93d80616..aeb637dc1fdf 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -1294,6 +1294,8 @@ static void i9xx_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct drm_i915_private *dev_priv = request->i915;
>
> +	i915_gem_request_submit(request);
> +
>  	I915_WRITE_TAIL(request->engine, request->tail);
>  }
>
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction
  2016-11-14  8:56   ` [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
@ 2016-11-14 11:09     ` Tvrtko Ursulin
  0 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:09 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> The scheduler needs to know the dependencies of each request for the
> lifetime of the request, as it may choose to reschedule the requests at
> any time and must ensure the dependency tree is not broken. This is in
> additional to using the fence to only allow execution after all
> dependencies have been completed.
>
> One option was to extend the fence to support the bidirectional
> dependency tracking required by the scheduler. However the mismatch in
> lifetimes between the submit fence and the request essentially meant
> that we had to build a completely separate struct (and we could not
> simply reuse the existing waitqueue in the fence for one half of the
> dependency tracking). The extra dependency tracking simply did not mesh
> well with the fence, and keeping it separate both keeps the fence
> implementation simpler and allows us to extend the dependency tracking
> into a priority tree (whilst maintaining support for reordering the
> tree).
>
> To avoid the additional allocations and list manipulations, the use of
> the priotree is disabled when there are no schedulers to use it.
>
> v2: Create a dedicated slab for i915_dependency.
>     Rename the lists.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         |  1 +
>  drivers/gpu/drm/i915/i915_gem.c         | 11 +++-
>  drivers/gpu/drm/i915/i915_gem_request.c | 91 ++++++++++++++++++++++++++++++++-
>  drivers/gpu/drm/i915/i915_gem_request.h | 33 ++++++++++++
>  4 files changed, 134 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index c0f1dfc7119e..ab4ad5522cf5 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1778,6 +1778,7 @@ struct drm_i915_private {
>  	struct kmem_cache *objects;
>  	struct kmem_cache *vmas;
>  	struct kmem_cache *requests;
> +	struct kmem_cache *dependencies;
>
>  	const struct intel_device_info info;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index e1afa11609a0..b331e5966fe2 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -4431,12 +4431,18 @@ i915_gem_load_init(struct drm_device *dev)
>  	if (!dev_priv->requests)
>  		goto err_vmas;
>
> +	dev_priv->dependencies = KMEM_CACHE(i915_dependency,
> +					    SLAB_HWCACHE_ALIGN |
> +					    SLAB_RECLAIM_ACCOUNT);
> +	if (!dev_priv->dependencies)
> +		goto err_requests;
> +
>  	mutex_lock(&dev_priv->drm.struct_mutex);
>  	INIT_LIST_HEAD(&dev_priv->gt.timelines);
>  	err = i915_gem_timeline_init__global(dev_priv);
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  	if (err)
> -		goto err_requests;
> +		goto err_dependencies;
>
>  	INIT_LIST_HEAD(&dev_priv->context_list);
>  	INIT_WORK(&dev_priv->mm.free_work, __i915_gem_free_work);
> @@ -4464,6 +4470,8 @@ i915_gem_load_init(struct drm_device *dev)
>
>  	return 0;
>
> +err_dependencies:
> +	kmem_cache_destroy(dev_priv->dependencies);
>  err_requests:
>  	kmem_cache_destroy(dev_priv->requests);
>  err_vmas:
> @@ -4480,6 +4488,7 @@ void i915_gem_load_cleanup(struct drm_device *dev)
>
>  	WARN_ON(!llist_empty(&dev_priv->mm.free_list));
>
> +	kmem_cache_destroy(dev_priv->dependencies);
>  	kmem_cache_destroy(dev_priv->requests);
>  	kmem_cache_destroy(dev_priv->vmas);
>  	kmem_cache_destroy(dev_priv->objects);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 1118cf48d6f0..78c87d94d205 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -113,6 +113,77 @@ i915_gem_request_remove_from_client(struct drm_i915_gem_request *request)
>  	spin_unlock(&file_priv->mm.lock);
>  }
>
> +static struct i915_dependency *
> +i915_dependency_alloc(struct drm_i915_private *i915)
> +{
> +	return kmem_cache_alloc(i915->dependencies, GFP_KERNEL);
> +}
> +
> +static void
> +i915_dependency_free(struct drm_i915_private *i915,
> +		     struct i915_dependency *dep)
> +{
> +	kmem_cache_free(i915->dependencies, dep);
> +}
> +
> +static void
> +__i915_priotree_add_dependency(struct i915_priotree *pt,
> +			       struct i915_priotree *signal,
> +			       struct i915_dependency *dep,
> +			       unsigned long flags)
> +{
> +	list_add(&dep->wait_link, &signal->waiters_list);
> +	list_add(&dep->signal_link, &pt->signalers_list);
> +	dep->signaler = signal;
> +	dep->flags = flags;
> +}
> +
> +static int
> +i915_priotree_add_dependency(struct drm_i915_private *i915,
> +			     struct i915_priotree *pt,
> +			     struct i915_priotree *signal)
> +{
> +	struct i915_dependency *dep;
> +
> +	dep = i915_dependency_alloc(i915);
> +	if (!dep)
> +		return -ENOMEM;
> +
> +	__i915_priotree_add_dependency(pt, signal, dep, I915_DEPENDENCY_ALLOC);
> +	return 0;
> +}
> +
> +static void
> +i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
> +{
> +	struct i915_dependency *dep, *next;
> +
> +	/* Everyone we depended upon (the fences we wait to be signaled)
> +	 * should retire before us and remove themselves from our list.
> +	 * However, retirement is run independently on each timeline and
> +	 * so we may be called out-of-order.
> +	 */
> +	list_for_each_entry_safe(dep, next, &pt->signalers_list, signal_link) {
> +		list_del(&dep->wait_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			i915_dependency_free(i915, dep);
> +	}
> +
> +	/* Remove ourselves from everyone who depends upon us */
> +	list_for_each_entry_safe(dep, next, &pt->waiters_list, wait_link) {
> +		list_del(&dep->signal_link);
> +		if (dep->flags & I915_DEPENDENCY_ALLOC)
> +			i915_dependency_free(i915, dep);
> +	}
> +}
> +
> +static void
> +i915_priotree_init(struct i915_priotree *pt)
> +{
> +	INIT_LIST_HEAD(&pt->signalers_list);
> +	INIT_LIST_HEAD(&pt->waiters_list);
> +}
> +
>  void i915_gem_retire_noop(struct i915_gem_active *active,
>  			  struct drm_i915_gem_request *request)
>  {
> @@ -182,6 +253,8 @@ static void i915_gem_request_retire(struct drm_i915_gem_request *request)
>  	i915_gem_context_put(request->ctx);
>
>  	dma_fence_signal(&request->fence);
> +
> +	i915_priotree_fini(request->i915, &request->priotree);
>  	i915_gem_request_put(request);
>  }
>
> @@ -467,6 +540,8 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
>  	 */
>  	i915_sw_fence_await_sw_fence(&req->execute, &req->submit, &req->execq);
>
> +	i915_priotree_init(&req->priotree);
> +
>  	INIT_LIST_HEAD(&req->active_list);
>  	req->i915 = dev_priv;
>  	req->engine = engine;
> @@ -520,6 +595,14 @@ i915_gem_request_await_request(struct drm_i915_gem_request *to,
>
>  	GEM_BUG_ON(to == from);
>
> +	if (to->engine->schedule) {
> +		ret = i915_priotree_add_dependency(to->i915,
> +						   &to->priotree,
> +						   &from->priotree);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	if (to->timeline == from->timeline)
>  		return 0;
>
> @@ -743,9 +826,15 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>
>  	prev = i915_gem_active_raw(&timeline->last_request,
>  				   &request->i915->drm.struct_mutex);
> -	if (prev)
> +	if (prev) {
>  		i915_sw_fence_await_sw_fence(&request->submit, &prev->submit,
>  					     &request->submitq);
> +		if (engine->schedule)
> +			__i915_priotree_add_dependency(&request->priotree,
> +						       &prev->priotree,
> +						       &request->dep,
> +						       0);
> +	}
>
>  	spin_lock_irq(&timeline->lock);
>  	list_add_tail(&request->link, &timeline->requests);
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 4d2784633d9f..943c39d2a62a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -44,6 +44,28 @@ struct intel_signal_node {
>  	struct intel_wait wait;
>  };
>
> +struct i915_dependency {
> +	struct i915_priotree *signaler;
> +	struct list_head signal_link;
> +	struct list_head wait_link;
> +	unsigned long flags;
> +#define I915_DEPENDENCY_ALLOC BIT(0)
> +};
> +
> +/* Requests exist in a complex web of interdependencies. Each request
> + * has to wait for some other request to complete before it is ready to be run
> + * (e.g. we have to wait until the pixels have been rendered into a texture
> + * before we can copy from it). We track the readiness of a request in terms
> + * of fences, but we also need to keep the dependency tree for the lifetime
> + * of the request (beyond the life of an individual fence). We use the tree
> + * at various points to reorder the requests whilst keeping the requests
> + * in order with respect to their various dependencies.
> + */
> +struct i915_priotree {
> +	struct list_head signalers_list; /* those before us, we depend upon */
> +	struct list_head waiters_list; /* those after us, they depend upon us */
> +};
> +
>  /**
>   * Request queue structure.
>   *
> @@ -105,6 +127,17 @@ struct drm_i915_gem_request {
>  	wait_queue_t submitq;
>  	wait_queue_t execq;
>
> +	/* A list of everyone we wait upon, and everyone who waits upon us.
> +	 * Even though we will not be submitted to the hardware before the
> +	 * submit fence is signaled (it waits for all external events as well
> +	 * as our own requests), the scheduler still needs to know the
> +	 * dependency tree for the lifetime of the request (from execbuf
> +	 * to retirement), i.e. bidirectional dependency information for the
> +	 * request not tied to individual fences.
> +	 */
> +	struct i915_priotree priotree;
> +	struct i915_dependency dep;
> +
>  	u32 global_seqno;
>
>  	/** GEM sequence number associated with the previous request,
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-14  8:56   ` [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
@ 2016-11-14 11:15     ` Tvrtko Ursulin
  2016-11-14 11:41       ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:15 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> Track the priority of each request and use it to determine the order in
> which we submit requests to the hardware via execlists.
>
> The priority of the request is determined by the user (eventually via
> the context) but may be overridden at any time by the driver. When we set
> the priority of the request, we bump the priority of all of its
> dependencies to match - so that a high priority drawing operation is not
> stuck behind a background task.
>
> When the request is ready to execute (i.e. we have signaled the submit
> fence following completion of all its dependencies, including third
> party fences), we put the request into a priority sorted rbtree to be
> submitted to the hardware. If the request is higher priority than all
> pending requests, it will be submitted on the next context-switch
> interrupt as soon as the hardware has completed the current request. We
> do not currently preempt any current execution to immediately run a very
> high priority request, at least not yet.
>
> One more limitation is that this first implementation is for
> execlists only, so it is currently limited to gen8/gen9.
>
> v2: Replace recursive priority inheritance bumping with an iterative
> depth-first search list.
> v3: list_next_entry() for walking lists
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_debugfs.c        |   7 +-
>  drivers/gpu/drm/i915/i915_gem.c            |   3 +-
>  drivers/gpu/drm/i915/i915_gem_request.c    |   5 ++
>  drivers/gpu/drm/i915/i915_gem_request.h    |   8 +-
>  drivers/gpu/drm/i915/i915_guc_submission.c |   1 +
>  drivers/gpu/drm/i915/intel_engine_cs.c     |   3 +-
>  drivers/gpu/drm/i915/intel_lrc.c           | 135 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/intel_ringbuffer.h    |   3 +-
>  8 files changed, 149 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
> index 03e3c2afbb06..1cc971cb6cb1 100644
> --- a/drivers/gpu/drm/i915/i915_debugfs.c
> +++ b/drivers/gpu/drm/i915/i915_debugfs.c
> @@ -631,8 +631,9 @@ static void print_request(struct seq_file *m,
>  			  struct drm_i915_gem_request *rq,
>  			  const char *prefix)
>  {
> -	seq_printf(m, "%s%x [%x:%x] @ %d: %s\n", prefix,
> +	seq_printf(m, "%s%x [%x:%x] prio=%d @ %dms: %s\n", prefix,
>  		   rq->global_seqno, rq->ctx->hw_id, rq->fence.seqno,
> +		   rq->priotree.priority,
>  		   jiffies_to_msecs(jiffies - rq->emitted_jiffies),
>  		   rq->timeline->common->name);
>  }
> @@ -3216,6 +3217,7 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>
>  		if (i915.enable_execlists) {
>  			u32 ptr, read, write;
> +			struct rb_node *rb;
>
>  			seq_printf(m, "\tExeclist status: 0x%08x %08x\n",
>  				   I915_READ(RING_EXECLIST_STATUS_LO(engine)),
> @@ -3255,7 +3257,8 @@ static int i915_engine_info(struct seq_file *m, void *unused)
>  			rcu_read_unlock();
>
>  			spin_lock_irq(&engine->timeline->lock);
> -			list_for_each_entry(rq, &engine->execlist_queue, execlist_link) {
> +			for (rb = engine->execlist_first; rb; rb = rb_next(rb)) {
> +				rq = rb_entry(rb, typeof(*rq), priotree.node);
>  				print_request(m, rq, "\t\tQ ");
>  			}
>  			spin_unlock_irq(&engine->timeline->lock);
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index b331e5966fe2..a9d27f3e88d2 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -2722,10 +2722,11 @@ static void i915_gem_cleanup_engine(struct intel_engine_cs *engine)
>
>  		spin_lock_irqsave(&engine->timeline->lock, flags);
>
> -		INIT_LIST_HEAD(&engine->execlist_queue);
>  		i915_gem_request_put(engine->execlist_port[0].request);
>  		i915_gem_request_put(engine->execlist_port[1].request);
>  		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
> +		engine->execlist_queue = RB_ROOT;
> +		engine->execlist_first = NULL;
>
>  		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  	}
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 78c87d94d205..13574a1e29b1 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -132,6 +132,7 @@ __i915_priotree_add_dependency(struct i915_priotree *pt,
>  			       struct i915_dependency *dep,
>  			       unsigned long flags)
>  {
> +	INIT_LIST_HEAD(&dep->dfs_link);
>  	list_add(&dep->wait_link, &signal->waiters_list);
>  	list_add(&dep->signal_link, &pt->signalers_list);
>  	dep->signaler = signal;
> @@ -158,6 +159,8 @@ i915_priotree_fini(struct drm_i915_private *i915, struct i915_priotree *pt)
>  {
>  	struct i915_dependency *dep, *next;
>
> +	GEM_BUG_ON(!RB_EMPTY_NODE(&pt->node));
> +
>  	/* Everyone we depended upon (the fences we wait to be signaled)
>  	 * should retire before us and remove themselves from our list.
>  	 * However, retirement is run independently on each timeline and
> @@ -182,6 +185,8 @@ i915_priotree_init(struct i915_priotree *pt)
>  {
>  	INIT_LIST_HEAD(&pt->signalers_list);
>  	INIT_LIST_HEAD(&pt->waiters_list);
> +	RB_CLEAR_NODE(&pt->node);
> +	pt->priority = INT_MIN;
>  }
>
>  void i915_gem_retire_noop(struct i915_gem_active *active,
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
> index 943c39d2a62a..e2b077df2da0 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.h
> +++ b/drivers/gpu/drm/i915/i915_gem_request.h
> @@ -48,6 +48,7 @@ struct i915_dependency {
>  	struct i915_priotree *signaler;
>  	struct list_head signal_link;
>  	struct list_head wait_link;
> +	struct list_head dfs_link;
>  	unsigned long flags;
>  #define I915_DEPENDENCY_ALLOC BIT(0)
>  };
> @@ -64,6 +65,10 @@ struct i915_dependency {
>  struct i915_priotree {
>  	struct list_head signalers_list; /* those before us, we depend upon */
>  	struct list_head waiters_list; /* those after us, they depend upon us */
> +	struct rb_node node;
> +	int priority;
> +#define I915_PRIORITY_MAX 1024
> +#define I915_PRIORITY_MIN (-I915_PRIORITY_MAX)
>  };
>
>  /**
> @@ -194,9 +199,6 @@ struct drm_i915_gem_request {
>  	struct drm_i915_file_private *file_priv;
>  	/** file_priv list entry for this request */
>  	struct list_head client_list;
> -
> -	/** Link in the execlist submission queue, guarded by execlist_lock. */
> -	struct list_head execlist_link;
>  };
>
>  extern const struct dma_fence_ops i915_fence_ops;
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 942f5000d372..4462112725ef 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -1532,6 +1532,7 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = i915_guc_submit;
> +		engine->schedule = NULL;
>
>  		/* Replay the current set of previously submitted requests */
>  		list_for_each_entry(request,
> diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c
> index c9171a058478..3da4d466e332 100644
> --- a/drivers/gpu/drm/i915/intel_engine_cs.c
> +++ b/drivers/gpu/drm/i915/intel_engine_cs.c
> @@ -239,7 +239,8 @@ static void intel_engine_init_timeline(struct intel_engine_cs *engine)
>   */
>  void intel_engine_setup_common(struct intel_engine_cs *engine)
>  {
> -	INIT_LIST_HEAD(&engine->execlist_queue);
> +	engine->execlist_queue = RB_ROOT;
> +	engine->execlist_first = NULL;
>
>  	intel_engine_init_timeline(engine);
>  	intel_engine_init_hangcheck(engine);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d1aea7462515..d13a335ad83a 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -432,9 +432,10 @@ static bool can_merge_ctx(const struct i915_gem_context *prev,
>
>  static void execlists_dequeue(struct intel_engine_cs *engine)
>  {
> -	struct drm_i915_gem_request *cursor, *last;
> +	struct drm_i915_gem_request *last;
>  	struct execlist_port *port = engine->execlist_port;
>  	unsigned long flags;
> +	struct rb_node *rb;
>  	bool submit = false;
>
>  	last = port->request;
> @@ -471,7 +472,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  	 */
>
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
> -	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
>  		/* Can we combine this request with the current port? It has to
>  		 * be the same context/ringbuffer and not have any exceptions
>  		 * (e.g. GVT saying never to combine contexts).
> @@ -503,6 +508,11 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  			port++;
>  		}
>
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
>  		/* We keep the previous context alive until we retire the
>  		 * following request. This ensures that the context object
>  		 * is still pinned for any residual writes the HW makes into it
> @@ -517,11 +527,8 @@ static void execlists_dequeue(struct intel_engine_cs *engine)
>  		submit = true;
>  	}
>  	if (submit) {
> -		/* Decouple all the requests submitted from the queue */
> -		engine->execlist_queue.next = &cursor->execlist_link;
> -		cursor->execlist_link.prev = &engine->execlist_queue;
> -
>  		i915_gem_request_assign(&port->request, last);
> +		engine->execlist_first = rb;
>  	}
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>
> @@ -626,6 +633,32 @@ static void intel_lrc_irq_handler(unsigned long data)
>  	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
>  }
>
> +static bool insert_request(struct i915_priotree *pt, struct rb_root *root)
> +{
> +	struct rb_node **p, *rb;
> +	bool first = true;
> +
> +	/* most positive priority is scheduled first, equal priorities fifo */
> +	rb = NULL;
> +	p = &root->rb_node;
> +	while (*p) {
> +		struct i915_priotree *pos;
> +
> +		rb = *p;
> +		pos = rb_entry(rb, typeof(*pos), node);
> +		if (pt->priority > pos->priority) {
> +			p = &rb->rb_left;
> +		} else {
> +			p = &rb->rb_right;
> +			first = false;
> +		}
> +	}
> +	rb_link_node(&pt->node, rb, p);
> +	rb_insert_color(&pt->node, root);
> +
> +	return first;
> +}
> +
>  static void execlists_submit_request(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> @@ -634,13 +667,96 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
>  	/* Will be called from irq-context when using foreign fences. */
>  	spin_lock_irqsave(&engine->timeline->lock, flags);
>
> -	list_add_tail(&request->execlist_link, &engine->execlist_queue);
> +	if (insert_request(&request->priotree, &engine->execlist_queue))
> +		engine->execlist_first = &request->priotree.node;
>  	if (execlists_elsp_idle(engine))
>  		tasklet_hi_schedule(&engine->irq_tasklet);
>
>  	spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  }
>
> +static struct intel_engine_cs *
> +pt_lock_engine(struct i915_priotree *pt, struct intel_engine_cs *locked)
> +{
> +	struct intel_engine_cs *engine;
> +
> +	engine = container_of(pt,
> +			      struct drm_i915_gem_request,
> +			      priotree)->engine;
> +	if (engine != locked) {
> +		if (locked)
> +			spin_unlock_irq(&locked->timeline->lock);
> +		spin_lock_irq(&engine->timeline->lock);
> +	}
> +
> +	return engine;
> +}
> +
> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> +{
> +	struct intel_engine_cs *engine = NULL;
> +	struct i915_dependency *dep, *p;
> +	struct i915_dependency stack;
> +	LIST_HEAD(dfs);
> +
> +	if (prio <= READ_ONCE(request->priotree.priority))
> +		return;
> +
> +	/* Need BKL in order to use the temporary link inside i915_dependency */
> +	lockdep_assert_held(&request->i915->drm.struct_mutex);
> +
> +	stack.signaler = &request->priotree;
> +	list_add(&stack.dfs_link, &dfs);
> +
> +	/* Recursively bump all dependent priorities to match the new request */

Missed last time round that the comment needs updating.

> +	list_for_each_entry_safe(dep, p, &dfs, dfs_link) {
> +		struct i915_priotree *pt = dep->signaler;
> +
> +		list_for_each_entry(p, &pt->signalers_list, signal_link)
> +			if (prio > READ_ONCE(p->signaler->priority))
> +				list_move_tail(&p->dfs_link, &dfs);
> +
> +		p = list_next_entry(dep, dfs_link);
> +		if (!RB_EMPTY_NODE(&pt->node))
> +			continue;
> +
> +		engine = pt_lock_engine(pt, engine);
> +
> +		/* If it is not already in the rbtree, we can update the
> +		 * priority inplace and skip over it (and its dependencies)
> +		 * if it is referenced *again* as we descend the dfs.
> +		 */
> +		if (prio > pt->priority && RB_EMPTY_NODE(&pt->node)) {
> +			pt->priority = prio;
> +			list_del_init(&dep->dfs_link);
> +		}
> +	}
> +
> +	/* Fifo and depth-first replacement ensure our deps execute before us */
> +	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link) {
> +		struct i915_priotree *pt = dep->signaler;
> +
> +		INIT_LIST_HEAD(&dep->dfs_link);
> +
> +		engine = pt_lock_engine(pt, engine);
> +
> +		if (prio <= pt->priority)
> +			continue;
> +
> +		GEM_BUG_ON(RB_EMPTY_NODE(&pt->node));
> +
> +		pt->priority = prio;
> +		rb_erase(&pt->node, &engine->execlist_queue);
> +		if (insert_request(pt, &engine->execlist_queue))
> +			engine->execlist_first = &pt->node;
> +	}
> +
> +	if (engine)
> +		spin_unlock_irq(&engine->timeline->lock);
> +
> +	/* XXX Do we need to preempt to make room for us and our deps? */
> +}
> +
>  int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
>  {
>  	struct intel_engine_cs *engine = request->engine;
> @@ -1677,8 +1793,10 @@ void intel_execlists_enable_submission(struct drm_i915_private *dev_priv)
>  	struct intel_engine_cs *engine;
>  	enum intel_engine_id id;
>
> -	for_each_engine(engine, dev_priv, id)
> +	for_each_engine(engine, dev_priv, id) {
>  		engine->submit_request = execlists_submit_request;
> +		engine->schedule = execlists_schedule;
> +	}
>  }
>
>  static void
> @@ -1691,6 +1809,7 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
>  	engine->emit_breadcrumb = gen8_emit_breadcrumb;
>  	engine->emit_breadcrumb_sz = gen8_emit_breadcrumb_sz;
>  	engine->submit_request = execlists_submit_request;
> +	engine->schedule = execlists_schedule;
>
>  	engine->irq_enable = gen8_logical_ring_enable_irq;
>  	engine->irq_disable = gen8_logical_ring_disable_irq;
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index b9583941eb6b..3466b4e77e7c 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -348,7 +348,8 @@ struct intel_engine_cs {
>  		struct drm_i915_gem_request *request;
>  		unsigned int count;
>  	} execlist_port[2];
> -	struct list_head execlist_queue;
> +	struct rb_root execlist_queue;
> +	struct rb_node *execlist_first;
>  	unsigned int fw_domains;
>  	bool disable_lite_restore_wa;
>  	bool preempt_wa;
>

Just the comment needs to be updated. With that:

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH v3 09/14] drm/i915: Store the execution priority on the context
  2016-11-14  8:56   ` [PATCH v3 09/14] drm/i915: Store the execution priority on the context Chris Wilson
@ 2016-11-14 11:16     ` Tvrtko Ursulin
  0 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:16 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:56, Chris Wilson wrote:
> In order to support userspace defining different levels of importance to
> different contexts, and in particular the preferred order of execution,
> store a priority value on each context. By default, the kernel's
> context, which is used for idling and other background tasks, is given
> minimum priority (all user contexts will execute first).
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_drv.h         | 1 +
>  drivers/gpu/drm/i915/i915_gem_context.c | 1 +
>  drivers/gpu/drm/i915/i915_gem_request.c | 2 +-
>  3 files changed, 3 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index ab4ad5522cf5..fb3e850f5d3a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -936,6 +936,7 @@ struct i915_gem_context {
>  	/* Unique identifier for this context, used by the hw for tracking */
>  	unsigned int hw_id;
>  	u32 user_handle;
> +	int priority; /* greater priorities are serviced first */
>
>  	u32 ggtt_alignment;
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 6dd475735f0a..1f94b8d6d83d 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -476,6 +476,7 @@ int i915_gem_context_init(struct drm_device *dev)
>  		return PTR_ERR(ctx);
>  	}
>
> +	ctx->priority = I915_PRIORITY_MIN; /* lowest priority; idle task */
>  	dev_priv->kernel_context = ctx;
>
>  	DRM_DEBUG_DRIVER("%s context support initialized\n",
> diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
> index 13574a1e29b1..b9b5253cf3cd 100644
> --- a/drivers/gpu/drm/i915/i915_gem_request.c
> +++ b/drivers/gpu/drm/i915/i915_gem_request.c
> @@ -867,7 +867,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
>  	 * run at the earliest possible convenience.
>  	 */
>  	if (engine->schedule)
> -		engine->schedule(request, 0);
> +		engine->schedule(request, request->ctx->priority);
>
>  	local_bh_disable();
>  	i915_sw_fence_commit(&request->submit);
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-14  8:57   ` [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2016-11-14 11:31     ` Tvrtko Ursulin
  2016-11-14 14:40       ` Chris Wilson
  2016-12-01 10:45     ` Tvrtko Ursulin
  1 sibling, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:31 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:57, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue, however it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

Don't know what to do with this one. It feels like it should be a 
separate patch so it can be performance evaluated properly?

It is also not clear to me why we don't need any similar limiting for 
the execlists request merging?

Regards,

Tvrtko

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 76 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 4462112725ef..088f5a99ecfc 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	rq->previous_context = engine->last_context;
>  	engine->last_context = rq->ctx;
>
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }
>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb8a75f6ca16..18dce4c66d56 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);
>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d13a335ad83a..ffab255e55a7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>

* Re: [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities
  2016-11-14  8:57   ` [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities Chris Wilson
@ 2016-11-14 11:32     ` Tvrtko Ursulin
  0 siblings, 0 replies; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:32 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:57, Chris Wilson wrote:
> Use a priority stored in the context as the initial value when
> submitting a request. This allows us to change the default priority on a
> per-context basis, allowing different contexts to be favoured with GPU
> time at the expense of lower importance work. The user can adjust the
> context's priority via I915_CONTEXT_PARAM_PRIORITY, with more positive
> values being higher priority (they will be serviced earlier, after their
> dependencies have been resolved). Any prerequisite work for an execbuf
> will have its priority raised to match the new request as required.
>
> Normal users can specify any value in the range of -1023 to 0 [default],
> i.e. they can reduce the priority of their workloads (and temporarily
> boost it back to normal if so desired).
>
> Privileged users can specify any value in the range of -1023 to 1023,
> [default is 0], i.e. they can raise their priority above all others and
> so potentially starve the system.
>
> Note that the existing schedulers are not fair, nor load balancing, the
> execution is strictly by priority on a first-come, first-served basis,
> and the driver may choose to boost some requests above the range
> available to users.
>
> This priority was originally based around nice(2), but evolved to allow
> clients to adjust their priority within a small range, and allow for a
> privileged high priority range.
>
> For example, this can be used to implement EGL_IMG_context_priority
> https://www.khronos.org/registry/egl/extensions/IMG/EGL_IMG_context_priority.txt
>
> 	EGL_CONTEXT_PRIORITY_LEVEL_IMG determines the priority level of
>         the context to be created. This attribute is a hint, as an
>         implementation may not support multiple contexts at some
>         priority levels and system policy may limit access to high
>         priority contexts to appropriate system privilege level. The
>         default value for EGL_CONTEXT_PRIORITY_LEVEL_IMG is
>         EGL_CONTEXT_PRIORITY_MEDIUM_IMG."
>
> so we can map
>
> 	PRIORITY_HIGH -> 1023 [privileged, will fall back to 0]
> 	PRIORITY_MED -> 0 [default]
> 	PRIORITY_LOW -> -1023
>
> They also map onto the priorities used by VkQueue (and a VkQueue is
> essentially a timeline, our i915_gem_context under full-ppgtt).
>
> Testcase: igt/gem_exec_schedule
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_gem_context.c | 20 ++++++++++++++++++++
>  include/uapi/drm/i915_drm.h             |  3 +++
>  2 files changed, 23 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c
> index 1f94b8d6d83d..1f74ab266f6b 100644
> --- a/drivers/gpu/drm/i915/i915_gem_context.c
> +++ b/drivers/gpu/drm/i915/i915_gem_context.c
> @@ -1101,6 +1101,9 @@ int i915_gem_context_getparam_ioctl(struct drm_device *dev, void *data,
>  	case I915_CONTEXT_PARAM_NO_ERROR_CAPTURE:
>  		args->value = !!(ctx->flags & CONTEXT_NO_ERROR_CAPTURE);
>  		break;
> +	case I915_CONTEXT_PARAM_PRIORITY:
> +		args->value = ctx->priority;
> +		break;
>  	default:
>  		ret = -EINVAL;
>  		break;
> @@ -1156,6 +1159,23 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data,
>  				ctx->flags &= ~CONTEXT_NO_ERROR_CAPTURE;
>  		}
>  		break;
> +
> +	case I915_CONTEXT_PARAM_PRIORITY:
> +		{
> +			int priority = args->value;
> +
> +			if (args->size)
> +				ret = -EINVAL;
> +			else if (priority >= I915_PRIORITY_MAX ||
> +				 priority <= -I915_PRIORITY_MAX)
> +				ret = -EINVAL;
> +			else if (priority > 0 && !capable(CAP_SYS_ADMIN))
> +				ret = -EPERM;
> +			else
> +				ctx->priority = priority;
> +		}
> +		break;
> +
>  	default:
>  		ret = -EINVAL;
>  		break;
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 1c12a350eca3..47901a8ad682 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -391,6 +391,8 @@ typedef struct drm_i915_irq_wait {
>
>  /* Query whether DRM_I915_GEM_EXECBUFFER2 supports user defined execution
>   * priorities and the driver will attempt to execute batches in priority order.
> + * The initial priority for each batch is supplied by the context and is
> + * controlled via I915_CONTEXT_PARAM_PRIORITY.
>   */
>  #define I915_PARAM_HAS_SCHEDULER	 41
>
> @@ -1224,6 +1226,7 @@ struct drm_i915_gem_context_param {
>  #define I915_CONTEXT_PARAM_NO_ZEROMAP	0x2
>  #define I915_CONTEXT_PARAM_GTT_SIZE	0x3
>  #define I915_CONTEXT_PARAM_NO_ERROR_CAPTURE	0x4
> +#define I915_CONTEXT_PARAM_PRIORITY	0x5
>  	__u64 value;
>  };
>
>

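As an aside, the userspace side of this would look roughly like the
below (a sketch on my part, assuming the uapi lands as proposed;
device/context setup and error handling elided):

	#include <stdint.h>
	#include <xf86drm.h>
	#include <i915_drm.h>

	/* Adjust a context's default priority: negative values lower it,
	 * positive values (CAP_SYS_ADMIN only) raise it.
	 */
	static int set_context_priority(int fd, uint32_t ctx_id, int prio)
	{
		struct drm_i915_gem_context_param p = {
			.ctx_id = ctx_id,
			.size = 0,	/* must be zero, or the kernel says -EINVAL */
			.param = I915_CONTEXT_PARAM_PRIORITY,
			.value = prio,	/* -1023..0 for everyone, up to 1023 privileged */
		};

		return drmIoctl(fd, DRM_IOCTL_I915_GEM_CONTEXT_SETPARAM, &p);
	}
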
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

Regards,

Tvrtko

* Re: [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-14 11:15     ` Tvrtko Ursulin
@ 2016-11-14 11:41       ` Chris Wilson
  2016-11-14 11:48         ` Tvrtko Ursulin
  0 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-11-14 11:41 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 14, 2016 at 11:15:52AM +0000, Tvrtko Ursulin wrote:
> On 14/11/2016 08:56, Chris Wilson wrote:
> >+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> >+{
> >+	struct intel_engine_cs *engine = NULL;
> >+	struct i915_dependency *dep, *p;
> >+	struct i915_dependency stack;
> >+	LIST_HEAD(dfs);
> >+
> >+	if (prio <= READ_ONCE(request->priotree.priority))
> >+		return;
> >+
> >+	/* Need BKL in order to use the temporary link inside i915_dependency */
> >+	lockdep_assert_held(&request->i915->drm.struct_mutex);
> >+
> >+	stack.signaler = &request->priotree;
> >+	list_add(&stack.dfs_link, &dfs);
> >+
> >+	/* Recursively bump all dependent priorities to match the new request */
> 
> Missed last time round that the comment needs updating.

It still is a recursive design though, just flat. That one word was
saving a paragraph :|

I think the easiest way to describe what the code is doing here is to
show the recursive version in the comment and then hope for inspiration
in describing how that maps onto the search list.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-14 11:41       ` Chris Wilson
@ 2016-11-14 11:48         ` Tvrtko Ursulin
  2016-11-14 14:25           ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-11-14 11:48 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 11:41, Chris Wilson wrote:
> On Mon, Nov 14, 2016 at 11:15:52AM +0000, Tvrtko Ursulin wrote:
>> On 14/11/2016 08:56, Chris Wilson wrote:
>>> +static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
>>> +{
>>> +	struct intel_engine_cs *engine = NULL;
>>> +	struct i915_dependency *dep, *p;
>>> +	struct i915_dependency stack;
>>> +	LIST_HEAD(dfs);
>>> +
>>> +	if (prio <= READ_ONCE(request->priotree.priority))
>>> +		return;
>>> +
>>> +	/* Need BKL in order to use the temporary link inside i915_dependency */
>>> +	lockdep_assert_held(&request->i915->drm.struct_mutex);
>>> +
>>> +	stack.signaler = &request->priotree;
>>> +	list_add(&stack.dfs_link, &dfs);
>>> +
>>> +	/* Recursively bump all dependent priorities to match the new request */
>>
>> Missed last time round that the comment needs updating.
>
> It still is a recursive design though, just flat. That one word was
> saving a paragraph :|
>
> I think the easiest way to describe what the code is doing here is to
> show the recursive version in the comment and then hope for inspiration
> in describing how that maps onto the search list.

I can see that angle, yes. Maybe just add a second sentence saying
something like "To avoid having recursive code for this recursive
update, we build a flat list of dependencies in a depth-first search
manner."?

Regards,

Tvrtko


* Re: [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities
  2016-11-14 11:48         ` Tvrtko Ursulin
@ 2016-11-14 14:25           ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14 14:25 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 14, 2016 at 11:48:32AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 11:41, Chris Wilson wrote:
> >On Mon, Nov 14, 2016 at 11:15:52AM +0000, Tvrtko Ursulin wrote:
> >>On 14/11/2016 08:56, Chris Wilson wrote:
> >>>+static void execlists_schedule(struct drm_i915_gem_request *request, int prio)
> >>>+{
> >>>+	struct intel_engine_cs *engine = NULL;
> >>>+	struct i915_dependency *dep, *p;
> >>>+	struct i915_dependency stack;
> >>>+	LIST_HEAD(dfs);
> >>>+
> >>>+	if (prio <= READ_ONCE(request->priotree.priority))
> >>>+		return;
> >>>+
> >>>+	/* Need BKL in order to use the temporary link inside i915_dependency */
> >>>+	lockdep_assert_held(&request->i915->drm.struct_mutex);
> >>>+
> >>>+	stack.signaler = &request->priotree;
> >>>+	list_add(&stack.dfs_link, &dfs);
> >>>+
> >>>+	/* Recursively bump all dependent priorities to match the new request */
> >>
> >>Missed last time round that the comment needs updating.
> >
> >It still is a recursive design though, just flat. That one word was
> >saving a paragraph :|
> >
> >I think the easiest way to describe what the code is doing here is to
> >show the recursive version in the comment and then hope for inspiration
> >in describing how that maps onto the search list.
> 
> I can see that angle, yes. Maybe just add a second sentence
> saying something like "To avoid having recursive code for this
> recursive update, we build a flat list of dependencies in a
> depth-first search manner."?

        /* Recursively bump all dependent priorities to match the new request.
         *
         * A naive approach would be to use recursion:
         * static void update_priorities(struct i915_priotree *pt, int prio) {
         *      list_for_each_entry(dep, &pt->signalers_list, signal_link)
         *              update_priorities(dep->signaler, prio);
         *      insert_request(pt);
         * }
         * but that may have unlimited recursion depth and so runs a very
         * real risk of overrunning the kernel stack. Instead, we build
         * a flat list of all dependencies, starting with the current request.
         * As we walk the list of dependencies, we add all of its dependencies
         * to the end of the list (this may include an already visited
         * request) and continue to walk onwards onto the new dependencies. The
         * end result is a topological list of requests in reverse order; the
         * last element in the list is the request we must execute first.
         */

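Spelled out, the flattened walk the comment describes is roughly the
below (a simplified sketch of the two loops in execlists_schedule();
bump_and_requeue() is a stand-in for the engine locking plus rbtree
reinsertion of the real second loop, and the pruning of already-boosted
nodes is omitted):

	struct i915_dependency *dep, *p;
	struct i915_dependency stack = { .signaler = &request->priotree };
	LIST_HEAD(dfs);

	list_add(&stack.dfs_link, &dfs);

	/* Pass 1: the list doubles as the DFS stack, growing as we walk. */
	list_for_each_entry(dep, &dfs, dfs_link)
		list_for_each_entry(p, &dep->signaler->signalers_list, signal_link)
			if (prio > READ_ONCE(p->signaler->priority))
				list_move_tail(&p->dfs_link, &dfs);

	/* Pass 2: reverse order, so every dependency is bumped and
	 * requeued before any of its waiters.
	 */
	list_for_each_entry_safe_reverse(dep, p, &dfs, dfs_link)
		bump_and_requeue(dep->signaler, prio);
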
-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-14 11:31     ` Tvrtko Ursulin
@ 2016-11-14 14:40       ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14 14:40 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Mon, Nov 14, 2016 at 11:31:11AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 08:57, Chris Wilson wrote:
> >This emulates execlists on top of the GuC in order to defer submission of
> >requests to the hardware. This deferral allows time for high priority
> >requests to gazump their way to the head of the queue, however it nerfs
> >the GuC by converting it back into a simple execlist (where the CPU has
> >to wake up after every request to feed new commands into the GuC).
> 
> Don't know what to do with this one. It feels like it should be a
> separate patch so it can be performance evaluated properly?

Yes. It is not clear if this is the right approach for the guc, since
the firmware may have other ideas on how to do scheduling. It is an
interesting thought experiment into how easy it would be to add
scheduling on top!
 
> It is also not clear to me why we don't need any similar limiting
> for the execlists request merging?

This uses exactly the same merging strategy as execlists (with the
exception of not supporting gvt's single request dispatch), so we should
be merging a ringfill of requests from one context if available. That
of course has its downsides wrt scheduling latency without preemption,
and I'm not sure there is even a right answer without preemption +
timeslicing - the choice is more or less merge everything, or merge
nothing, so we stick with the status quo of merging everything until
proven otherwise.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
                     ` (14 preceding siblings ...)
  2016-11-14 10:57   ` Tvrtko Ursulin
@ 2016-11-14 14:48   ` Joonas Lahtinen
  2016-11-14 15:13     ` Chris Wilson
  15 siblings, 1 reply; 82+ messages in thread
From: Joonas Lahtinen @ 2016-11-14 14:48 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx

On ma, 2016-11-14 at 08:56 +0000, Chris Wilson wrote:
> Localise the static struct lock_class_key to the caller of
> i915_sw_fence_init() so that we create a lock_class instance for each
> unique sw_fence rather than all sw_fences sharing the same
> lock_class. This eliminate some lockdep false positive when using fences
> from within fence callbacks.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

<SNIP>

> @@ -40,7 +40,16 @@ typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
>  				      enum i915_sw_fence_notify state);
>  #define __i915_sw_fence_call __aligned(4)
>  
> -void i915_sw_fence_init(struct i915_sw_fence *fence, i915_sw_fence_notify_t fn);
> +void __i915_sw_fence_init(struct i915_sw_fence *fence,
> +			  i915_sw_fence_notify_t fn,
> +			  const char *name,
> +			  struct lock_class_key *key);
> +#define i915_sw_fence_init(fence, fn) do {			\

Gimme a (line) break here.

> +	static struct lock_class_key __key; 			\

When lockdep is disabled, this becomes zero size. We might still get
rid of the #fence strings with some #ifdefs - did you measure the
impact? I remember some crying over bytes with for_each_engine_masked.

> +								\
> +	__i915_sw_fence_init((fence), fn, #fence, &__key);	\
> +} while (0)
> +

With the above addressed, and assuming we're not compiling in anything extra;

Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>

Regards, Joonas
-- 
Joonas Lahtinen
Open Source Technology Center
Intel Corporation

* Re: [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass
  2016-11-14 14:48   ` Joonas Lahtinen
@ 2016-11-14 15:13     ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-11-14 15:13 UTC (permalink / raw)
  To: Joonas Lahtinen; +Cc: intel-gfx

On Mon, Nov 14, 2016 at 04:48:00PM +0200, Joonas Lahtinen wrote:
> On ma, 2016-11-14 at 08:56 +0000, Chris Wilson wrote:
> > +	static struct lock_class_key __key; 			\
> 
> When lockdep is disabled, this becomes zero size. We might still get
> rid of the #fence strings with some #ifdefs - did you measure the
> impact? I remember some crying over bytes with for_each_engine_masked.

I was copying mutex_init. Avoiding it takes more than just a little ifdeffery :|

The strings currently cost us around 160 bytes of text.

   text	   data	    bss	    dec	    hex	filename
1222524	   5077	    608	1228209	 12bdb1	drivers/gpu/drm/i915/i915.ko
1222364	   5077	    608	1228049	 12bd11 drivers/gpu/drm/i915/i915.ko

diff --git a/drivers/gpu/drm/i915/i915_sw_fence.c b/drivers/gpu/drm/i915/i915_sw_fence.c
index 65fded24a9eb..804af5766650 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.c
+++ b/drivers/gpu/drm/i915/i915_sw_fence.c
@@ -110,6 +110,9 @@ static void i915_sw_fence_await(struct i915_sw_fence *fence)
        WARN_ON(atomic_inc_return(&fence->pending) <= 1);
 }
 
+#ifndef CONFIG_LOCKDEP
+static
+#endif
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
                          i915_sw_fence_notify_t fn,
                          const char *name,
@@ -123,6 +126,16 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence,
        fence->flags = (unsigned long)fn;
 }
 
+#ifndef CONFIG_LOCKDEP
+void i915_sw_fence_init(struct i915_sw_fence *fence,
+                       i915_sw_fence_notify_t fn)
+{
+       static struct lock_class_key __key;
+
+       __i915_sw_fence_init(fence, fn, NULL, &__key);
+}
+#endif
+
 void i915_sw_fence_commit(struct i915_sw_fence *fence)
 {
        i915_sw_fence_complete(fence);
diff --git a/drivers/gpu/drm/i915/i915_sw_fence.h b/drivers/gpu/drm/i915/i915_sw_fence.h
index 23748a1ae6ae..d8510a4b02bd 100644
--- a/drivers/gpu/drm/i915/i915_sw_fence.h
+++ b/drivers/gpu/drm/i915/i915_sw_fence.h
@@ -40,6 +40,7 @@ typedef int (*i915_sw_fence_notify_t)(struct i915_sw_fence *,
                                      enum i915_sw_fence_notify state);
 #define __i915_sw_fence_call __aligned(4)
 
+#ifdef CONFIG_LOCKDEP
 void __i915_sw_fence_init(struct i915_sw_fence *fence,
                          i915_sw_fence_notify_t fn,
                          const char *name,
@@ -49,6 +50,10 @@ void __i915_sw_fence_init(struct i915_sw_fence *fence,
                                                                \
        __i915_sw_fence_init((fence), (fn), #fence, &__key);    \
 } while (0)
+#else
+void i915_sw_fence_init(struct i915_sw_fence *fence,
+                       i915_sw_fence_notify_t fn);
+#endif
 
 void i915_sw_fence_commit(struct i915_sw_fence *fence);
 

Can we do that more neatly?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf
  2016-11-14  8:57   ` [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf Chris Wilson
@ 2016-11-14 22:29     ` Rafael Antognolli
  2017-01-25 20:27     ` Chad Versace
  1 sibling, 0 replies; 82+ messages in thread
From: Rafael Antognolli @ 2016-11-14 22:29 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Hi Chris,

I'm not sure you are waiting for this kind of feedback, but I've managed
to test this particular patch with Mesa, and I have some piglit tests
for it too. They are waiting for review, but at least the feature is
working as expected:

https://github.com/rantogno/piglit/commits/review/fences-v02

I used mesa and libdrm patches from more than a single set, but in case
you want to look they are here:

https://github.com/rantogno/mesa/commits/wip-fence
https://github.com/rantogno/libdrm/commits/wip-fence

And my kernel tree when I tested it:

https://github.com/rantogno/linux/commits/wip-fence

So in case it helps, you can add a

Tested-by: Rafael Antognolli <rafael.antognolli@intel.com>

PS: I can test it with this latest series and everything more up to date,
if it will help get this patch landed sooner.

Cheers,
Rafael

On Mon, Nov 14, 2016 at 08:57:03AM +0000, Chris Wilson wrote:
> Now that the user can opt-out of implicit fencing, we need to give them
> back control over the fencing. We employ sync_file to wrap our
> drm_i915_gem_request and provide an fd that userspace can merge with
> other sync_file fds and pass back to the kernel to wait upon before
> future execution.
> 
> Testcase: igt/gem_exec_fence
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/Kconfig               |  1 +
>  drivers/gpu/drm/i915/i915_drv.c            |  3 +-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
>  include/uapi/drm/i915_drm.h                | 35 ++++++++++++++++++-
>  4 files changed, 86 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/Kconfig b/drivers/gpu/drm/i915/Kconfig
> index beed5c1d2cd7..bed81fe1d2a7 100644
> --- a/drivers/gpu/drm/i915/Kconfig
> +++ b/drivers/gpu/drm/i915/Kconfig
> @@ -19,6 +19,7 @@ config DRM_I915
>  	select INPUT if ACPI
>  	select ACPI_VIDEO if ACPI
>  	select ACPI_BUTTON if ACPI
> +	select SYNC_FILE
>  	help
>  	  Choose this option if you have a system that has "Intel Graphics
>  	  Media Accelerator" or "HD Graphics" integrated graphics,
> diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c
> index 657c465a8d50..6fe7f41a5b5b 100644
> --- a/drivers/gpu/drm/i915/i915_drv.c
> +++ b/drivers/gpu/drm/i915/i915_drv.c
> @@ -344,6 +344,7 @@ static int i915_getparam(struct drm_device *dev, void *data,
>  	case I915_PARAM_HAS_COHERENT_PHYS_GTT:
>  	case I915_PARAM_HAS_EXEC_SOFTPIN:
>  	case I915_PARAM_HAS_EXEC_ASYNC:
> +	case I915_PARAM_HAS_EXEC_FENCE:
>  		/* For the time being all of these are always true;
>  		 * if some supported hardware does not have one of these
>  		 * features this value needs to be provided from
> @@ -2533,7 +2534,7 @@ static const struct drm_ioctl_desc i915_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(I915_HWS_ADDR, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
>  	DRM_IOCTL_DEF_DRV(I915_GEM_INIT, drm_noop, DRM_AUTH|DRM_MASTER|DRM_ROOT_ONLY),
>  	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER, i915_gem_execbuffer, DRM_AUTH),
> -	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
> +	DRM_IOCTL_DEF_DRV(I915_GEM_EXECBUFFER2_WR, i915_gem_execbuffer2, DRM_AUTH|DRM_RENDER_ALLOW),
>  	DRM_IOCTL_DEF_DRV(I915_GEM_PIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
>  	DRM_IOCTL_DEF_DRV(I915_GEM_UNPIN, i915_gem_reject_pin_ioctl, DRM_AUTH|DRM_ROOT_ONLY),
>  	DRM_IOCTL_DEF_DRV(I915_GEM_BUSY, i915_gem_busy_ioctl, DRM_AUTH|DRM_RENDER_ALLOW),
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index 781b5559f86e..facec610b55a 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -28,6 +28,7 @@
>  
>  #include <linux/dma_remapping.h>
>  #include <linux/reservation.h>
> +#include <linux/sync_file.h>
>  #include <linux/uaccess.h>
>  
>  #include <drm/drmP.h>
> @@ -1597,6 +1598,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	struct i915_execbuffer_params *params = &params_master;
>  	const u32 ctx_id = i915_execbuffer2_get_context_id(*args);
>  	u32 dispatch_flags;
> +	struct dma_fence *in_fence = NULL;
> +	struct sync_file *out_fence = NULL;
> +	int out_fence_fd = -1;
>  	int ret;
>  	bool need_relocs;
>  
> @@ -1640,6 +1644,23 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		dispatch_flags |= I915_DISPATCH_RS;
>  	}
>  
> +	if (args->flags & I915_EXEC_FENCE_IN) {
> +		in_fence = sync_file_get_fence(lower_32_bits(args->rsvd2));
> +		if (!in_fence) {
> +			ret = -EINVAL;
> +			goto pre_mutex_err;
> +		}
> +	}
> +
> +	if (args->flags & I915_EXEC_FENCE_OUT) {
> +		out_fence_fd = get_unused_fd_flags(O_CLOEXEC);
> +		if (out_fence_fd < 0) {
> +			ret = out_fence_fd;
> +			out_fence_fd = -1;
> +			goto pre_mutex_err;
> +		}
> +	}
> +
>  	/* Take a local wakeref for preparing to dispatch the execbuf as
>  	 * we expect to access the hardware fairly frequently in the
>  	 * process. Upon first dispatch, we acquire another prolonged
> @@ -1784,6 +1805,21 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  		goto err_batch_unpin;
>  	}
>  
> +	if (in_fence) {
> +		ret = i915_gem_request_await_dma_fence(params->request,
> +						       in_fence);
> +		if (ret < 0)
> +			goto err_request;
> +	}
> +
> +	if (out_fence_fd != -1) {
> +		out_fence = sync_file_create(&params->request->fence);
> +		if (!out_fence) {
> +			ret = -ENOMEM;
> +			goto err_request;
> +		}
> +	}
> +
>  	/* Whilst this request exists, batch_obj will be on the
>  	 * active_list, and so will hold the active reference. Only when this
>  	 * request is retired will the batch_obj be moved onto the
> @@ -1811,6 +1847,16 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	ret = execbuf_submit(params, args, &eb->vmas);
>  err_request:
>  	__i915_add_request(params->request, ret == 0);
> +	if (out_fence) {
> +		if (ret == 0) {
> +			fd_install(out_fence_fd, out_fence->file);
> +			args->rsvd2 &= GENMASK_ULL(31, 0); /* keep in-fence */
> +			args->rsvd2 |= (u64)out_fence_fd << 32;
> +			out_fence_fd = -1;
> +		} else {
> +			fput(out_fence->file);
> +		}
> +	}
>  
>  err_batch_unpin:
>  	/*
> @@ -1832,6 +1878,9 @@ i915_gem_do_execbuffer(struct drm_device *dev, void *data,
>  	/* intel_gpu_busy should also get a ref, so it will free when the device
>  	 * is really idle. */
>  	intel_runtime_pm_put(dev_priv);
> +	if (out_fence_fd != -1)
> +		put_unused_fd(out_fence_fd);
> +	dma_fence_put(in_fence);
>  	return ret;
>  }
>  
> @@ -1939,11 +1988,6 @@ i915_gem_execbuffer2(struct drm_device *dev, void *data,
>  		return -EINVAL;
>  	}
>  
> -	if (args->rsvd2 != 0) {
> -		DRM_DEBUG("dirty rvsd2 field\n");
> -		return -EINVAL;
> -	}
> -
>  	exec2_list = drm_malloc_gfp(args->buffer_count,
>  				    sizeof(*exec2_list),
>  				    GFP_TEMPORARY);
> diff --git a/include/uapi/drm/i915_drm.h b/include/uapi/drm/i915_drm.h
> index 4bd83c0b07db..90082269fb50 100644
> --- a/include/uapi/drm/i915_drm.h
> +++ b/include/uapi/drm/i915_drm.h
> @@ -246,6 +246,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
>  #define DRM_I915_OVERLAY_ATTRS	0x28
>  #define DRM_I915_GEM_EXECBUFFER2	0x29
> +#define DRM_I915_GEM_EXECBUFFER2_WR	DRM_I915_GEM_EXECBUFFER2
>  #define DRM_I915_GET_SPRITE_COLORKEY	0x2a
>  #define DRM_I915_SET_SPRITE_COLORKEY	0x2b
>  #define DRM_I915_GEM_WAIT	0x2c
> @@ -279,6 +280,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_GEM_INIT		DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_INIT, struct drm_i915_gem_init)
>  #define DRM_IOCTL_I915_GEM_EXECBUFFER	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER, struct drm_i915_gem_execbuffer)
>  #define DRM_IOCTL_I915_GEM_EXECBUFFER2	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2, struct drm_i915_gem_execbuffer2)
> +#define DRM_IOCTL_I915_GEM_EXECBUFFER2_WR	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_EXECBUFFER2_WR, struct drm_i915_gem_execbuffer2)
>  #define DRM_IOCTL_I915_GEM_PIN		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_PIN, struct drm_i915_gem_pin)
>  #define DRM_IOCTL_I915_GEM_UNPIN	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_UNPIN, struct drm_i915_gem_unpin)
>  #define DRM_IOCTL_I915_GEM_BUSY		DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_BUSY, struct drm_i915_gem_busy)
> @@ -401,6 +403,12 @@ typedef struct drm_i915_irq_wait {
>   */
>  #define I915_PARAM_HAS_EXEC_ASYNC	 42
>  
> +/* Query whether DRM_I915_GEM_EXECBUFFER2 supports explicit fence support -
> + * both being able to pass in a sync_file fd to wait upon before executing,
> + * and being able to return a new sync_file fd that is signaled when the
> + * current request is complete.
> + */
> +#define I915_PARAM_HAS_EXEC_FENCE	 43
>  
>  typedef struct drm_i915_getparam {
>  	__s32 param;
> @@ -854,7 +862,32 @@ struct drm_i915_gem_execbuffer2 {
>   */
>  #define I915_EXEC_RESOURCE_STREAMER     (1<<15)
>  
> -#define __I915_EXEC_UNKNOWN_FLAGS -(I915_EXEC_RESOURCE_STREAMER<<1)
> +/* Setting I915_EXEC_FENCE_IN implies that lower_32_bits(rsvd2) represent
> + * a sync_file fd to wait upon (in a nonblocking manner) prior to executing
> + * the batch.
> + *
> + * Returns -EINVAL if the sync_file fd cannot be found.
> + */
> +#define I915_EXEC_FENCE_IN		(1<<16)
> +
> +/* Setting I915_EXEC_FENCE_OUT causes the ioctl to return a sync_file fd
> + * in the upper_32_bits(rsvd2) upon success. Ownership of the fd is given
> + * to the caller, and it should be closed after use. (The fd is a regular
> + * file descriptor and will be cleaned up on process termination. It holds
> + * a reference to the request, but nothing else.)
> + *
> + * The sync_file fd can be combined with other sync_file fds and passed either
> + * to execbuf using I915_EXEC_FENCE_IN, to atomic KMS ioctls (so that a flip
> + * will only occur after this request completes), or to other devices.
> + *
> + * Using I915_EXEC_FENCE_OUT requires use of
> + * DRM_IOCTL_I915_GEM_EXECBUFFER2_WR ioctl so that the result is written
> + * back to userspace. Failure to do so will cause the out-fence to always
> + * be reported as zero, and the real fence fd to be leaked.
> + */
> +#define I915_EXEC_FENCE_OUT		(1<<17)
> +
> +#define __I915_EXEC_UNKNOWN_FLAGS (-(I915_EXEC_FENCE_OUT<<1))
>  
>  #define I915_EXEC_CONTEXT_ID_MASK	(0xffffffff)
>  #define i915_execbuffer2_set_context_id(eb2, context) \
> -- 
> 2.10.2
> 
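
For reference, consuming this from userspace then looks roughly like
the below (my sketch, not from the patch; it only assumes the flags and
rsvd2 packing documented above, with all buffer/batch setup elided):

	#include <stdint.h>
	#include <xf86drm.h>
	#include <i915_drm.h>

	/* Submit, waiting on in_fd first; returns an out-fence fd, or -1
	 * on error. The returned fd must be close()d by the caller.
	 */
	static int execbuf_with_fences(int fd,
				       struct drm_i915_gem_execbuffer2 *eb,
				       int in_fd)
	{
		eb->flags |= I915_EXEC_FENCE_IN | I915_EXEC_FENCE_OUT;
		eb->rsvd2 = (uint32_t)in_fd;	/* in-fence in the low 32 bits */

		if (drmIoctl(fd, DRM_IOCTL_I915_GEM_EXECBUFFER2_WR, eb))
			return -1;

		return eb->rsvd2 >> 32;		/* out-fence in the high 32 bits */
	}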

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-11-14  8:57   ` [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
  2016-11-14 11:31     ` Tvrtko Ursulin
@ 2016-12-01 10:45     ` Tvrtko Ursulin
  2016-12-01 11:18       ` Chris Wilson
  1 sibling, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-12-01 10:45 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 14/11/2016 08:57, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue, however it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

As it is starting to sink in that we'll have to add this hack sooner or
later, review comments below.

Also, would you be OK rebasing this, or would you prefer to delegate it?

> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 85 +++++++++++++++++++++++++-----
>  drivers/gpu/drm/i915/i915_irq.c            |  4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |  3 --
>  3 files changed, 76 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 4462112725ef..088f5a99ecfc 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -469,7 +469,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	freespace = CIRC_SPACE(gc->wq_tail, desc->head, gc->wq_size);
>  	freespace -= gc->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -479,7 +479,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		gc->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>
>  	return ret;
>  }
> @@ -491,9 +491,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(gc->wq_rsvd) < wqi_size);
>
> -	spin_lock(&gc->wq_lock);
> +	spin_lock_irq(&gc->wq_lock);
>  	gc->wq_rsvd -= wqi_size;
> -	spin_unlock(&gc->wq_lock);
> +	spin_unlock_irq(&gc->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -644,7 +644,7 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	rq->previous_context = engine->last_context;
>  	engine->last_context = rq->ctx;
>
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>
>  	spin_lock(&client->wq_lock);
>  	guc_wq_item_append(client, rq);
> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  	spin_unlock(&client->wq_lock);
>  }

Confused me at first here until I noticed engine->submit_request will be 
the execlist_submit_request later. Perhaps it would be good to rename a 
lot of things now, like engine->request_queue, 
intel_engine_submit_request and maybe more?

>
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {

Not sure if GVT comes into the picture here, but it does not sound like
it would harm to use can_merge_ctx here?

> +			if (port != engine->execlist_port)
> +				break;

It may be overkill for the first version, but I was thinking that we
don't have to limit it to two at a time. It would depend on measuring,
of course. But perhaps it would make sense to do the generalisation of
the number of supported ports straight away.

> +
> +			i915_gem_request_assign(&port->request, last);
> +			dma_fence_enable_sw_signaling(&last->fence);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		dma_fence_enable_sw_signaling(&last->fence);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}

We could theoretically share most of execlists_dequeue and just do a
couple of things differently depending on the mode.

It looks like one of those could be a new engine->submit_ports vfunc.
There is also the lite restore WA and sw signalling to design in
nicely, but it may be worth sharing the code. It would be renamed to
something like scheduler_dequeue.
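
Very roughly, this is what I have in mind (a sketch only;
scheduler_dequeue, fill_ports and submit_ports are made-up names):

	static void scheduler_dequeue(struct intel_engine_cs *engine)
	{
		bool submit;

		/* Common part: walk engine->execlist_queue under the
		 * timeline lock, merging requests from the same context
		 * into engine->execlist_port[], as execlists_dequeue()
		 * does today.
		 */
		submit = fill_ports(engine);

		/* Backend-specific part: ELSP write for execlists vs
		 * work queue item + doorbell (+ sw signaling) for the GuC.
		 */
		if (submit)
			engine->submit_ports(engine);
	}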

> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			rq = port[1].request;
> +			port[0].request = rq;
> +			port[1].request = NULL;
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -1531,16 +1595,13 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> -		list_for_each_entry(request,
> -				    &engine->timeline->requests, link) {
> +		list_for_each_entry(request, &engine->timeline->requests, link)
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> -			if (i915_sw_fence_done(&request->submit))
> -				i915_guc_submit(request);
> -		}
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index cb8a75f6ca16..18dce4c66d56 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);

This would be better made conditional on GuC submission, to avoid
calling tasklet_schedule twice (occasionally) in execlists mode.

>  		notify_ring(engine);
> +	}
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>  		tasklet_schedule(&engine->irq_tasklet);
>  }
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index d13a335ad83a..ffab255e55a7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1425,9 +1425,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
>  	if (request->ctx != port[0].request->ctx) {
>

Regards,

Tvrtko

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-12-01 10:45     ` Tvrtko Ursulin
@ 2016-12-01 11:18       ` Chris Wilson
  2016-12-01 12:45         ` Tvrtko Ursulin
  0 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2016-12-01 11:18 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
> 
> On 14/11/2016 08:57, Chris Wilson wrote:
> >This emulates execlists on top of the GuC in order to defer submission of
> >requests to the hardware. This deferral allows time for high priority
> >requests to gazump their way to the head of the queue, however it nerfs
> >the GuC by converting it back into a simple execlist (where the CPU has
> >to wake up after every request to feed new commands into the GuC).
> 
> As it is starting to sink in that we'll have to add this hack sooner
> or later, review comments below.
> 
> Also, would you be OK to rebase this or would you prefer to delegate it?

It's been very easy to keep it current.

> >@@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
> > 	spin_unlock(&client->wq_lock);
> > }
> 
> Confused me at first here until I noticed engine->submit_request
> will be the execlist_submit_request later. Perhaps it would be good
> to rename a lot of things now, like engine->request_queue,
> intel_engine_submit_request and maybe more?

Looking at the lifecycle of the request and getting clear names for the
phases and their vfuncs would be sensible. It may also be worthwhile to
decide that some are not engine tasks and belong to a new entity
(dev_priv->gt.X ?)
 
> >+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> >+{
> >+	struct execlist_port *port = engine->execlist_port;
> >+	struct drm_i915_gem_request *last = port[0].request;
> >+	unsigned long flags;
> >+	struct rb_node *rb;
> >+	bool submit = false;
> >+
> >+	spin_lock_irqsave(&engine->timeline->lock, flags);
> >+	rb = engine->execlist_first;
> >+	while (rb) {
> >+		struct drm_i915_gem_request *cursor =
> >+			rb_entry(rb, typeof(*cursor), priotree.node);
> >+
> >+		if (last && cursor->ctx != last->ctx) {
> 
> Not sure if GVT comes into the picture here, but it does not sound
> like it would harm to use can_merge_ctx here?

I wasn't sure what path GVT would take either. So just went with the
simple version that looked as similar to the current guc submission as
possible. Also offloading the scheduling to the guc via semaphores will
likely make this whole chain look completely different.

> >+			if (port != engine->execlist_port)
> >+				break;
> 
> It may be overkill for the first version, but I was thinking that
> we don't have to limit it to two at a time. And it would depend on
> measuring of course. But perhaps it would make sense to do the
> generalisation of the number of supported ports straight away.

Definitely. I was just looking at a minimal conversion, hence reusing
the existing tracking, and limits.

> We could theoretically share most of the execlist_dequeue and just
> do a couple things differently depending on the mode.

Yes, there were just a couple of intrusive warts that I felt justified
in keeping the routines separate. But mostly it was the merge_ctx
decision that looked to be backend specific.
 
> Looks like one could be a new engine->submit_ports vfunc. And there
> is also the lite restore WA and sw signalling to design in nicely,
> but it may be worth sharing the code. It would be renamed to
> something like scheduler_dequeue.

> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index cb8a75f6ca16..18dce4c66d56 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
> > static __always_inline void
> > gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
> > {
> >-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> >+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> >+		tasklet_schedule(&engine->irq_tasklet);
> 
> This would be better made conditional on GuC submission just to
> avoid calling tasklet_schedule twice (occasionally) in execlist mode.

It's not a huge cost if we do schedule the tasklet twice (in the same
interrupt context at least). Otherwise, yes we are incurring more mmio
reads to determine that the CSB didn't advance. On the other hand, I
really don't want to have if (i915.enable_guc_submission) here. Maybe
if (engine->user_irq_tasklet) tasklet_schedule(engine->user_irq_tasklet)
?
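
i.e. something along these lines (sketch only, not compile-tested; 
user_irq_tasklet would be left NULL unless guc submission installs it):

	static __always_inline void
	gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
	{
		if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
			/* only guc submission hooks a tasklet to the user interrupt */
			if (engine->user_irq_tasklet)
				tasklet_schedule(engine->user_irq_tasklet);
			notify_ring(engine);
		}
		if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
			tasklet_schedule(&engine->irq_tasklet);
	}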

Speaking of CSB, the spec keeps hinting that "CSB occupies a whole
cacheline, making the read cheap". I'm not aware of how we can do a UC
cacheline read in a single transfer. Ideas?  movntdqa I thought was
specifically WC.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-12-01 11:18       ` Chris Wilson
@ 2016-12-01 12:45         ` Tvrtko Ursulin
  2016-12-01 13:01           ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Tvrtko Ursulin @ 2016-12-01 12:45 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, John Harrison


On 01/12/2016 11:18, Chris Wilson wrote:
> On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
>>
>> On 14/11/2016 08:57, Chris Wilson wrote:
>>> This emulates execlists on top of the GuC in order to defer submission of
>>> requests to the hardware. This deferral allows time for high priority
>>> requests to gazump their way to the head of the queue, however it nerfs
>>> the GuC by converting it back into a simple execlist (where the CPU has
>>> to wake up after every request to feed new commands into the GuC).
>>
>> As it is starting to sink in that we'll have to add this hack sooner
>> or later, review comments below.
>>
>> Also, would you be OK to rebase this or would you prefer to delegate it?
>
> It's been very easy to keep it current.
>
>>> @@ -665,6 +665,70 @@ static void i915_guc_submit(struct drm_i915_gem_request *rq)
>>> 	spin_unlock(&client->wq_lock);
>>> }
>>
>> Confused me at first here until I noticed engine->submit_request
>> will be the execlist_submit_request later. Perhaps it would be good
>> to rename a lot of things now, like engine->request_queue,
>> intel_engine_submit_request and maybe more?
>
> Looking at the lifecycle of the request and getting clear names for the
> phases and their vfuncs would be sensible. It may also be worthwhile to
> decide that some are not engine tasks and belong to a new entity
> (dev_priv->gt.X ?)

Like submit_request? Makes sense yes.

>>> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
>>> +{
>>> +	struct execlist_port *port = engine->execlist_port;
>>> +	struct drm_i915_gem_request *last = port[0].request;
>>> +	unsigned long flags;
>>> +	struct rb_node *rb;
>>> +	bool submit = false;
>>> +
>>> +	spin_lock_irqsave(&engine->timeline->lock, flags);
>>> +	rb = engine->execlist_first;
>>> +	while (rb) {
>>> +		struct drm_i915_gem_request *cursor =
>>> +			rb_entry(rb, typeof(*cursor), priotree.node);
>>> +
>>> +		if (last && cursor->ctx != last->ctx) {
>>
>> Not sure if GVT comes into the picture here, but it does not sound
>> like it would harm to use can_merge_ctx here?
>
> I wasn't sure what path GVT would take either. So just went with the
> simple version that looked as similar to the current guc submission as
> possible. Also offloading the scheduling to the guc via semaphores will
> likely make this whole chain look completely different.

Hmm I am not up to speed with that. So you are saying it doesn't make 
sense to unify this?

>>> +			if (port != engine->execlist_port)
>>> +				break;
>>
>> It may be overkill for the first version, but I was thinking that
>> we don't have to limit it to two at a time. And it would depend on
>> measuring of course. But perhaps it would make sense to do the
>> generalisation of the number of supported ports straight away.
>
> Definitely. I was just looking at a minimal conversion, hence reusing
> the existing tracking, and limits.

Definitely leave it for later, or definitely it makes sense to 
generalise right now? I was just thinking that when someone goes to test 
this and finds the throughput regresses, it might be easier to just 
say please try i915.guc_submit_ports=8 or something.
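
(I'm imagining a module parameter roughly like the below - the name and 
plumbing are entirely made up, just to show the idea:

	/* i915_params.c, hypothetical */
	module_param_named(guc_submit_ports, i915.guc_submit_ports, int, 0400);
	MODULE_PARM_DESC(guc_submit_ports,
		"Number of GuC submission ports to keep filled (default: 2)");

so throughput could be compared without rebuilding the driver.)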

>> We could theoretically share most of the execlist_dequeue and just
>> do a couple things differently depending on the mode.
>
> Yes, there were just a couple of intrusive warts that I felt justified
> in keeping the routines separate. But mostly it was the merge_ctx
> decision that looked to be backend specific.

Cool.

>> Looks like one could be a new engine->submit_ports vfunc. And there
>> is also the lite restore WA and sw signalling to design in nicely,
>> but it may be worth sharing the code. It would be renamed to
>> something like scheduler_dequeue.
>
>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index cb8a75f6ca16..18dce4c66d56 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -1341,8 +1341,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>>> static __always_inline void
>>> gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>>> {
>>> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>>> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
>>> +		tasklet_schedule(&engine->irq_tasklet);
>>
>> This would be better made conditional on GuC submission just to
>> avoid calling tasklet_schedule twice (occasionally) in execlist mode.
>
> It's not a huge cost if we do schedule the tasklet twice (in the same
> interrupt context at least). Otherwise, yes we are incurring more mmio
> reads to determine that the CSB didn't advance. On the other hand, I
> really don't want to have if (i915.enable_guc_submission) here. Maybe
> if (engine->user_irq_tasklet) tasklet_schedule(engine->user_irq_tasklet)
> ?

Hm, I don't know. One conditional plus a redundant engine struct member 
or just one conditional. Maybe dev_priv->gt.handle_user_interrupt and 
dev_priv->gt.handle_ctx_switch_interrupt? You won't like the indirection 
I am sure. Go with the user_irq_tasklet then.

> Speaking of CSB, the spec keeps hinting that "CSB occupies a whole
> cacheline, making the read cheap". I'm not aware of how we can do a UC
> cacheline read in a single transfer. Ideas?  movntdqa I thought was
> specifically WC.

Unfortunately I don't know. Low-level x86 is not my strong area. I only 
learnt of movntdqa when Akash and you started using it for example.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc
  2016-12-01 12:45         ` Tvrtko Ursulin
@ 2016-12-01 13:01           ` Chris Wilson
  0 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2016-12-01 13:01 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Thu, Dec 01, 2016 at 12:45:18PM +0000, Tvrtko Ursulin wrote:
> 
> On 01/12/2016 11:18, Chris Wilson wrote:
> >On Thu, Dec 01, 2016 at 10:45:51AM +0000, Tvrtko Ursulin wrote:
> >>
> >>On 14/11/2016 08:57, Chris Wilson wrote:
> >>>+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> >>>+{
> >>>+	struct execlist_port *port = engine->execlist_port;
> >>>+	struct drm_i915_gem_request *last = port[0].request;
> >>>+	unsigned long flags;
> >>>+	struct rb_node *rb;
> >>>+	bool submit = false;
> >>>+
> >>>+	spin_lock_irqsave(&engine->timeline->lock, flags);
> >>>+	rb = engine->execlist_first;
> >>>+	while (rb) {
> >>>+		struct drm_i915_gem_request *cursor =
> >>>+			rb_entry(rb, typeof(*cursor), priotree.node);
> >>>+
> >>>+		if (last && cursor->ctx != last->ctx) {
> >>
> >>Not sure if GVT comes into the picture here, but it does not sound
> >>like it would harm to use can_merge_ctx here?
> >
> >I wasn't sure what path GVT would take either. So just went with the
> >simple version that looked as similar to the current guc submission as
> >possible. Also offloading the scheduling to the guc via semaphores will
> >likely make this whole chain look completely different.
> 
> Hmm I am not up to speed with that. So you are saying it doesn't
> make sense to unify this?

Just not sure yet. Too much duplication and too much engineering are both
traps we may make for ourselves.

> >>>+			if (port != engine->execlist_port)
> >>>+				break;
> >>
> >>It may be overkill for the first version, but I was thinking that
> >>we don't have to limit it to two at a time. And it would depend on
> >>measuring of course. But perhaps it would make sense to do the
> >>generalisation of the number of supported ports straight away.
> >
> >Definitely. I was just looking at a minimal conversion, hence reusing
> >the existing tracking, and limits.
> 
> Definitely leave it for later, or definitely it makes sense to
> generalise right now? I was just thinking that when someone goes to
> test this and finds the throughput regresses, it might be
> easier to just say please try i915.guc_submit_ports=8 or something.

It was "definitely not worth it in this patch and definitely makes sense
to investigate". Very rapid diminishing returns, it comes down to how
many requests will complete in the service time of the first irq. You'll
be looking at the no-op switching workloads that stress the driver,
rather than the actual workloads that stress the system. The cheapest
typical ping-pong is GL client -> display server -> GL client, though
OpenCL may beat that, but for that GL sequence, 3 ports would easily
cover us. [1 active, 2 pending slots really.]
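
As a sketch of the generalisation (the define and array size are 
invented, just for illustration):

	#define EXECLIST_MAX_PORTS 3 /* hypothetical: 1 active + 2 pending */

	struct execlist_port execlist_port[EXECLIST_MAX_PORTS];

with the dequeue loop filling ports until the array is exhausted rather 
than stopping at the hardcoded second slot.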
-Chris
 

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf
  2016-11-14  8:57   ` [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf Chris Wilson
  2016-11-14 22:29     ` Rafael Antognolli
@ 2017-01-25 20:27     ` Chad Versace
  1 sibling, 0 replies; 82+ messages in thread
From: Chad Versace @ 2017-01-25 20:27 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

If I understand correctly, this patch preserves the kernel's current
implicit fencing, even when an input fence fd is given to execbuffer. I'm
convinced that's the right approach.

If userspace does want to disable the implicit fencing during an
execbuffer, then it should disable that implicit fencing through an
*explicit* knob. I believe the kernel should not interpret the presence
of an in fence fd in execbuffer as that knob. If it did, then using this
feature from GL/EGL userspace would be unwieldy.

In other words, I like this.

Patch 14 gets my
Acked-by: Chad Versace <chadversary@chromium.org>

On Mon 14 Nov 2016, Chris Wilson wrote:
> Now that the user can opt-out of implicit fencing, we need to give them
> back control over the fencing. We employ sync_file to wrap our
> drm_i915_gem_request and provide an fd that userspace can merge with
> other sync_file fds and pass back to the kernel to wait upon before
> future execution.
> 
> Testcase: igt/gem_exec_fence
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/Kconfig               |  1 +
>  drivers/gpu/drm/i915/i915_drv.c            |  3 +-
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 54 +++++++++++++++++++++++++++---
>  include/uapi/drm/i915_drm.h                | 35 ++++++++++++++++++-
>  4 files changed, 86 insertions(+), 7 deletions(-)
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing
  2016-11-14  8:57   ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
@ 2017-01-25 20:38     ` Chad Versace
  2017-01-26 10:32       ` Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Chad Versace @ 2017-01-25 20:38 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Mon 14 Nov 2016, Chris Wilson wrote:
> Userspace is faced with a dilemma. The kernel requires implicit fencing
> to manage resource usage (we always must wait for the GPU to finish
> before releasing its PTE) and for third parties. However, userspace may
> wish to avoid this serialisation if it is either using explicit fencing
> between parties and wants more fine-grained access to buffers (e.g. it
> may partition the buffer between uses and track fences on ranges rather
> than the implicit fences tracking the whole object). It follows that
> userspace needs a mechanism to avoid the kernel's serialisation on its
> implicit fences before execbuf execution.
> 
> The next question is whether this is an object, execbuf or context flag.
> Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
> shared winsys buffers, but implicit fencing on internal surfaces)
require a per-object level flag. Given that this flag needs to be
set only once for the lifetime of the object, this reduces the convenience of
> having an execbuf or context level flag (and avoids having multiple
> pieces of uABI controlling the same feature).
> 
> Incorrect use of this flag will result in rendering corruption and GPU
> hangs - but will not result in use-after-free or similar resource
> tracking issues.
> 
> Serious caveat: write ordering is not strictly correct after setting
> this flag on a render target on multiple engines. This affects all
> subsequent GEM operations (execbuf, set-domain, pread) and shared
> dma-buf operations. A fix is possible - but costly (both in terms of
> further ABI changes and runtime overhead).
> 
> Testcase: igt/gem_exec_async
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_drv.c            |  1 +
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
>  include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
>  3 files changed, 32 insertions(+), 1 deletion(-)

I'm neutral about this patch. I believe patch 14/14 is useful with or
without this patch, and I want to see patch 14 land regardless of what
happens with this one.

I'm not opposed to this patch. It's just that I don't yet understand
exactly if Mesa's EGL/GL code could effectively use this feature for
Android winsys buffers. The amount of information loss between the
EGL/GL apis and the eventual execbuffer submission may prevent Mesa from
annotating the Android winsys buffers with this.  I'm unsure.  I'm still
thinking about it.

But, if Chris, or anyone, already has plans to use this somehow, perhaps
in the DDX, then don't let my hesitation block the patch.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing
  2017-01-25 20:38     ` Chad Versace
@ 2017-01-26 10:32       ` Chris Wilson
  2017-01-26 10:58         ` [PATCH] i965: Share the workaround bo between all contexts Chris Wilson
  2017-01-27  0:07         ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chad Versace
  0 siblings, 2 replies; 82+ messages in thread
From: Chris Wilson @ 2017-01-26 10:32 UTC (permalink / raw)
  To: Chad Versace, intel-gfx; +Cc: Daniel Vetter

On Wed, Jan 25, 2017 at 12:38:32PM -0800, Chad Versace wrote:
> On Mon 14 Nov 2016, Chris Wilson wrote:
> > Userspace is faced with a dilemma. The kernel requires implicit fencing
> > to manage resource usage (we always must wait for the GPU to finish
> > before releasing its PTE) and for third parties. However, userspace may
> > wish to avoid this serialisation if it is either using explicit fencing
> > between parties and wants more fine-grained access to buffers (e.g. it
> > may partition the buffer between uses and track fences on ranges rather
> > than the implicit fences tracking the whole object). It follows that
> > userspace needs a mechanism to avoid the kernel's serialisation on its
> > implicit fences before execbuf execution.
> > 
> > The next question is whether this is an object, execbuf or context flag.
> > Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
> > shared winsys buffers, but implicit fencing on internal surfaces)
> > require a per-object level flag. Given that this flag needs to be
> > set only once for the lifetime of the object, this reduces the convenience of
> > having an execbuf or context level flag (and avoids having multiple
> > pieces of uABI controlling the same feature).
> > 
> > Incorrect use of this flag will result in rendering corruption and GPU
> > hangs - but will not result in use-after-free or similar resource
> > tracking issues.
> > 
> > Serious caveat: write ordering is not strictly correct after setting
> > this flag on a render target on multiple engines. This affects all
> > subsequent GEM operations (execbuf, set-domain, pread) and shared
> > dma-buf operations. A fix is possible - but costly (both in terms of
> > further ABI changes and runtime overhead).
> > 
> > Testcase: igt/gem_exec_async
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > ---
> >  drivers/gpu/drm/i915/i915_drv.c            |  1 +
> >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
> >  include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
> >  3 files changed, 32 insertions(+), 1 deletion(-)
> 
> I'm neutral about this patch. I believe patch 14/14 is useful with or
> without this patch, and I want to see patch 14 land regardless of what
> happens with this one.

I don't like the patch, it opens up a big wart in the GEM api (incorrect
write tracking on GEM/dma-buf across multiple timelines). Otoh, being
able to discard the implicit fence tracking seems to be an important
feature request - if we go forward without it, we will then be lacking a
feature that is common across other drivers and in particular seems to
be commonplace in the Android ecosystem.

Daniel, what's your feeling? One problem will be that the
synchronisation issue may be hard to track down in future (proving that
the cause of a stall is an avoidable implicit fence).
 
> I'm not opposed to this patch. It's just that I don't yet understand
> exactly if Mesa's EGL/GL code could effectively use this feature for
> Android winsys buffers. The amount of information loss between the
> EGL/GL apis and the eventual execbuffer submission may prevent Mesa from
> annotating the Android winsys buffers with this.  I'm unsure.  I'm still
> thinking about it.
> 
> But, if Chris, or anyone, already has plans to use this somehow, perhaps
> in the DDX, then don't let my hesitation block the patch.

Actually, the example I have would be for mesa. It can use this on its
own scratch buffers to just discard writes and prevent ordering on
a single scratch shared between contexts, and for its fence tracking using
a single page for multiple rings.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 10:32       ` Chris Wilson
@ 2017-01-26 10:58         ` Chris Wilson
  2017-01-26 17:39           ` [Mesa-dev] " Chad Versace
  2017-01-27  0:07         ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chad Versace
  1 sibling, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2017-01-26 10:58 UTC (permalink / raw)
  To: mesa-dev; +Cc: Daniel Vetter, Chad Versace, Kenneth Graunke, intel-gfx

Since the workaround bo is used strictly as a write-only buffer, we need
only allocate one per screen and use the same one from all contexts.

(The caveat here is during extension initialisation, where we write into
and read back register values from the buffer, but that is performed only
once for the first context - and barring synchronisation issues should not
be a problem. Safer would be to move that also to the screen.)

v2: Give the workaround bo its own init function and don't piggy back
intel_bufmgr_init() since it is not that related.

v3: Drop the reference count of the workaround bo for the context since
the context itself is owned by the screen (and so we can rely on the bo
existing for the lifetime of the context).

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Kenneth Graunke <kenneth@whitecape.org>
Cc: Martin Peres <martin.peres@linux.intel.com>
Cc: Chad Versace <chadversary@chromium.org>
Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 src/mesa/drivers/dri/i965/Makefile.am        |  2 +-
 src/mesa/drivers/dri/i965/brw_pipe_control.c | 12 +++++-----
 src/mesa/drivers/dri/i965/intel_screen.c     | 24 ++++++++++++++++++++
 src/mesa/drivers/dri/i965/intel_screen.h     |  1 +
 src/mesa/drivers/dri/i965/libdrm_compat.h    | 33 ++++++++++++++++++++++++++++
 5 files changed, 64 insertions(+), 8 deletions(-)
 create mode 100644 src/mesa/drivers/dri/i965/libdrm_compat.h

diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index 6602a17995..b208563f7d 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -77,7 +77,7 @@ noinst_LTLIBRARIES = \
 	libi965_compiler.la \
 	$(I965_PERGEN_LIBS)
 
-libi965_dri_la_SOURCES = $(i965_FILES)
+libi965_dri_la_SOURCES = $(i965_FILES) libdrm_compat.h
 libi965_dri_la_LIBADD = \
 	$(top_builddir)/src/intel/common/libintel_common.la \
 	$(top_builddir)/src/intel/isl/libisl.la \
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index b8f740640f..22c946f744 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -371,20 +371,18 @@ brw_init_pipe_control(struct brw_context *brw,
    /* We can't just use brw_state_batch to get a chunk of space for
     * the gen6 workaround because it involves actually writing to
     * the buffer, and the kernel doesn't let us write to the batch.
+    *
+    * As the screen has a longer lifetime than the contexts derived from
+    * it, we do not need to add our own reference count and can simply
+    * rely on the bo always existing for the duration of the context.
     */
-   brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
-                                           "pipe_control workaround",
-                                           4096, 4096);
-   if (brw->workaround_bo == NULL)
-      return -ENOMEM;
+   brw->workaround_bo = brw->screen->workaround_bo;
 
    brw->pipe_controls_since_last_cs_stall = 0;
-
    return 0;
 }
 
 void
 brw_fini_pipe_control(struct brw_context *brw)
 {
-   drm_intel_bo_unreference(brw->workaround_bo);
 }
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 5f800008c1..6e788c41cc 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -107,6 +107,7 @@ DRI_CONF_END
 #include "brw_context.h"
 
 #include "i915_drm.h"
+#include "libdrm_compat.h"
 
 /**
  * For debugging purposes, this returns a time in seconds.
@@ -1030,6 +1031,7 @@ intelDestroyScreen(__DRIscreen * sPriv)
 {
    struct intel_screen *screen = sPriv->driverPrivate;
 
+   drm_intel_bo_unreference(screen->workaround_bo);
    dri_bufmgr_destroy(screen->bufmgr);
    driDestroyOptionInfo(&screen->optionCache);
 
@@ -1210,6 +1212,25 @@ intel_init_bufmgr(struct intel_screen *screen)
 }
 
 static bool
+intel_init_workaround_bo(struct intel_screen *screen)
+{
+   /* A small scratch bo shared by all contexts, primarily used
+    * for doing PIPECONTROL serialisation writes that are discarded.
+    */
+   screen->workaround_bo =
+      drm_intel_bo_alloc(screen->bufmgr, "pipe_control w/a", 4096, 4096);
+
+   /* We want to use this bo from any and all contexts, without undue
+    * write ordering between them. To prevent the kernel enforcing
+    * the order due to writes from different contexts, we disable
+    * the use of (the kernel's) implicit sync on this bo.
+    */
+   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
+
+   return screen->workaround_bo != NULL;
+}
+
+static bool
 intel_detect_swizzling(struct intel_screen *screen)
 {
    drm_intel_bo *buffer;
@@ -1675,6 +1696,9 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
    if (!intel_init_bufmgr(screen))
        return false;
 
+   if (!intel_init_workaround_bo(screen))
+       return false;
+
    screen->deviceID = drm_intel_bufmgr_gem_get_devid(screen->bufmgr);
    if (!gen_get_device_info(screen->deviceID, &screen->devinfo))
       return false;
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index 890dd9044b..0fb83e724f 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -74,6 +74,7 @@ struct intel_screen
 #define KERNEL_ALLOWS_COMPUTE_DISPATCH              (1<<4)
 
    dri_bufmgr *bufmgr;
+   drm_intel_bo *workaround_bo;
 
    /**
     * A unique ID for shader programs.
diff --git a/src/mesa/drivers/dri/i965/libdrm_compat.h b/src/mesa/drivers/dri/i965/libdrm_compat.h
new file mode 100644
index 0000000000..bef9a1286b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/libdrm_compat.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LIBDRM_COMPAT_H
+#define __LIBDRM_COMPAT_H
+
+#include <intel_bufmgr.h>
+
+#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
+#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
+#endif
+
+#endif /* !__LIBDRM_COMPAT_H */
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 10:58         ` [PATCH] i965: Share the workaround bo between all contexts Chris Wilson
@ 2017-01-26 17:39           ` Chad Versace
  2017-01-26 18:05             ` Chris Wilson
                               ` (2 more replies)
  0 siblings, 3 replies; 82+ messages in thread
From: Chad Versace @ 2017-01-26 17:39 UTC (permalink / raw)
  To: Chris Wilson; +Cc: mesa-dev, Kenneth Graunke, intel-gfx, Daniel Vetter

On Thu 26 Jan 2017, Chris Wilson wrote:
> Since the workaround bo is used strictly as a write-only buffer, we need
> only allocate one per screen and use the same one from all contexts.
> 
> (The caveat here is during extension initialisation, where we write into
> and read back register values from the buffer, but that is performed only
> once for the first context - and barring synchronisation issues should not
> be a problem. Safer would be to move that also to the screen.)
> 
> v2: Give the workaround bo its own init function and don't piggy back
> intel_bufmgr_init() since it is not that related.
> 
> v3: Drop the reference count of the workaround bo for the context since
> the context itself is owned by the screen (and so we can rely on the bo
> existing for the lifetime of the context).

I like this idea, but I have questions and comments about the details.
More questions than comments, really.

Today, with only Mesa changes, could we effectively do the same as
  drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
by hacking Mesa to set no read/write domain when emitting relocs for the
workaround_bo? (I admit I don't fully understand the kernel's domain
tracking). If that does work, then it just would require a small hack to
brw_emit_pipe_control_write().
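
I'm imagining something like this in the gen6 path of 
brw_emit_pipe_control_write(), if domain-less relocs really do skip the 
implicit sync (an untested guess on my part):

   /* No read/write domains -> the kernel records no write hazard. */
   OUT_RELOC(bo, 0, 0, offset | PIPE_CONTROL_GLOBAL_GTT_WRITE);

instead of passing I915_GEM_DOMAIN_INSTRUCTION for both domains.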

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Kenneth Graunke <kenneth@whitecape.org>
> Cc: Martin Peres <martin.peres@linux.intel.com>
> Cc: Chad Versace <chadversary@chromium.org>
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  src/mesa/drivers/dri/i965/Makefile.am        |  2 +-
>  src/mesa/drivers/dri/i965/brw_pipe_control.c | 12 +++++-----
>  src/mesa/drivers/dri/i965/intel_screen.c     | 24 ++++++++++++++++++++
>  src/mesa/drivers/dri/i965/intel_screen.h     |  1 +
>  src/mesa/drivers/dri/i965/libdrm_compat.h    | 33 ++++++++++++++++++++++++++++
>  5 files changed, 64 insertions(+), 8 deletions(-)
>  create mode 100644 src/mesa/drivers/dri/i965/libdrm_compat.h
> 
> diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
> index 6602a17995..b208563f7d 100644
> --- a/src/mesa/drivers/dri/i965/Makefile.am
> +++ b/src/mesa/drivers/dri/i965/Makefile.am
> @@ -77,7 +77,7 @@ noinst_LTLIBRARIES = \
>  	libi965_compiler.la \
>  	$(I965_PERGEN_LIBS)
>  
> -libi965_dri_la_SOURCES = $(i965_FILES)
> +libi965_dri_la_SOURCES = $(i965_FILES) libdrm_compat.h
>  libi965_dri_la_LIBADD = \
>  	$(top_builddir)/src/intel/common/libintel_common.la \
>  	$(top_builddir)/src/intel/isl/libisl.la \
> diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
> index b8f740640f..22c946f744 100644
> --- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
> +++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
> @@ -371,20 +371,18 @@ brw_init_pipe_control(struct brw_context *brw,
>     /* We can't just use brw_state_batch to get a chunk of space for
>      * the gen6 workaround because it involves actually writing to
>      * the buffer, and the kernel doesn't let us write to the batch.
> +    *
> +    * As the screen has a longer lifetime than the contexts derived from
> +    * it, we do not need to add our own reference count and can simply
> +    * rely on the bo always existing for the duration of the context.
>      */
> -   brw->workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
> -                                           "pipe_control workaround",
> -                                           4096, 4096);
> -   if (brw->workaround_bo == NULL)
> -      return -ENOMEM;
> +   brw->workaround_bo = brw->screen->workaround_bo;
>  
>     brw->pipe_controls_since_last_cs_stall = 0;
> -
>     return 0;
>  }
>  
>  void
>  brw_fini_pipe_control(struct brw_context *brw)
>  {
> -   drm_intel_bo_unreference(brw->workaround_bo);
>  }
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> index 5f800008c1..6e788c41cc 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.c
> +++ b/src/mesa/drivers/dri/i965/intel_screen.c
> @@ -107,6 +107,7 @@ DRI_CONF_END
>  #include "brw_context.h"
>  
>  #include "i915_drm.h"
> +#include "libdrm_compat.h"
>  
>  /**
>   * For debugging purposes, this returns a time in seconds.
> @@ -1030,6 +1031,7 @@ intelDestroyScreen(__DRIscreen * sPriv)
>  {
>     struct intel_screen *screen = sPriv->driverPrivate;
>  
> +   drm_intel_bo_unreference(screen->workaround_bo);
>     dri_bufmgr_destroy(screen->bufmgr);
>     driDestroyOptionInfo(&screen->optionCache);
>  
> @@ -1210,6 +1212,25 @@ intel_init_bufmgr(struct intel_screen *screen)
>  }
>  
>  static bool
> +intel_init_workaround_bo(struct intel_screen *screen)
> +{
> +   /* A small scratch bo shared by all contexts, primarily used
> +    * for doing PIPECONTROL serialisation writes that are discarded.
> +    */
> +   screen->workaround_bo =
> +      drm_intel_bo_alloc(screen->bufmgr, "pipe_control w/a", 4096, 4096);
> +
> +   /* We want to use this bo from any and all contexts, without undue
> > +    * write ordering between them. To prevent the kernel enforcing
> +    * the order due to writes from different contexts, we disable
> +    * the use of (the kernel's) implicit sync on this bo.
> +    */
> +   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> +
> +   return screen->workaround_bo != NULL;
> +}
> +
> +static bool
>  intel_detect_swizzling(struct intel_screen *screen)
>  {
>     drm_intel_bo *buffer;
> @@ -1675,6 +1696,9 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
>     if (!intel_init_bufmgr(screen))
>         return false;
>  
> +   if (!intel_init_workaround_bo(screen))
> +       return false;
> +
>     screen->deviceID = drm_intel_bufmgr_gem_get_devid(screen->bufmgr);
>     if (!gen_get_device_info(screen->deviceID, &screen->devinfo))
>        return false;
> diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
> index 890dd9044b..0fb83e724f 100644
> --- a/src/mesa/drivers/dri/i965/intel_screen.h
> +++ b/src/mesa/drivers/dri/i965/intel_screen.h
> @@ -74,6 +74,7 @@ struct intel_screen
>  #define KERNEL_ALLOWS_COMPUTE_DISPATCH              (1<<4)
>  
>     dri_bufmgr *bufmgr;
> +   drm_intel_bo *workaround_bo;
>  
>     /**
>      * A unique ID for shader programs.
> diff --git a/src/mesa/drivers/dri/i965/libdrm_compat.h b/src/mesa/drivers/dri/i965/libdrm_compat.h
> new file mode 100644
> index 0000000000..bef9a1286b
> --- /dev/null
> +++ b/src/mesa/drivers/dri/i965/libdrm_compat.h
> @@ -0,0 +1,33 @@
> +/*
> + * Copyright © 2016 Intel Corporation
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a
> + * copy of this software and associated documentation files (the "Software"),
> + * to deal in the Software without restriction, including without limitation
> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
> + * and/or sell copies of the Software, and to permit persons to whom the
> + * Software is furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice (including the next
> + * paragraph) shall be included in all copies or substantial portions of the
> + * Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
> + * IN THE SOFTWARE.
> + */
> +
> +#ifndef __LIBDRM_COMPAT_H
> +#define __LIBDRM_COMPAT_H
> +
> +#include <intel_bufmgr.h>
> +
> +#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
> +#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
> +#endif
> +
> +#endif /* !__LIBDRM_COMPAT_H */
> -- 
> 2.11.0
> 
> _______________________________________________
> mesa-dev mailing list
> mesa-dev@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 17:39           ` [Mesa-dev] " Chad Versace
@ 2017-01-26 18:05             ` Chris Wilson
  2017-01-26 23:40               ` Chad Versace
  2017-01-26 18:46             ` Chris Wilson
  2017-01-27  0:01             ` Chad Versace
  2 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2017-01-26 18:05 UTC (permalink / raw)
  To: Chad Versace, mesa-dev, Daniel Vetter, Kenneth Graunke, intel-gfx

On Thu, Jan 26, 2017 at 09:39:51AM -0800, Chad Versace wrote:
> On Thu 26 Jan 2017, Chris Wilson wrote:
> > Since the workaround bo is used strictly as a write-only buffer, we need
> > only allocate one per screen and use the same one from all contexts.
> > 
> > (The caveat here is during extension initialisation, where we write into
> > and read back register values from the buffer, but that is performed only
> > once for the first context - and barring synchronisation issues should not
> > be a problem. Safer would be to move that also to the screen.)
> > 
> > v2: Give the workaround bo its own init function and don't piggy back
> > intel_bufmgr_init() since it is not that related.
> > 
> > v3: Drop the reference count of the workaround bo for the context since
> > the context itself is owned by the screen (and so we can rely on the bo
> > existing for the lifetime of the context).
> 
> I like this idea, but I have questions and comments about the details.
> More questions than comments, really.
> 
> Today, with only Mesa changes, could we effectively do the same as
>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> by hacking Mesa to set no read/write domain when emitting relocs for the
> workaround_bo? (I admit I don't fully understand the kernel's domain
> tracking). If that does work, then it just would require a small hack to
> brw_emit_pipe_control_write().

Yes, for anything that is totally scratch just not setting the write
hazard is the same. For something like the seqno page where we have
multiple engines that we do want to be preserved, not setting the write
hazard had the consequence that the page could be lost under memory pressure
or across resume. (As usual there are some details here: this part of the
ABI had to be relaxed because userspace didn't have this flag.)
But that doesn't sell many bananas.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 17:39           ` [Mesa-dev] " Chad Versace
  2017-01-26 18:05             ` Chris Wilson
@ 2017-01-26 18:46             ` Chris Wilson
  2017-01-27  0:01             ` Chad Versace
  2 siblings, 0 replies; 82+ messages in thread
From: Chris Wilson @ 2017-01-26 18:46 UTC (permalink / raw)
  To: Chad Versace, mesa-dev, Daniel Vetter, Kenneth Graunke, intel-gfx

On Thu, Jan 26, 2017 at 09:39:51AM -0800, Chad Versace wrote:
> On Thu 26 Jan 2017, Chris Wilson wrote:
> > Since the workaround bo is used strictly as a write-only buffer, we need
> > only allocate one per screen and use the same one from all contexts.
> > 
> > (The caveat here is during extension initialisation, where we write into
> > and read back register values from the buffer, but that is performed only
> > once for the first context - and barring synchronisation issues should not
> > be a problem. Safer would be to move that also to the screen.)
> > 
> > v2: Give the workaround bo its own init function and don't piggy back
> > intel_bufmgr_init() since it is not that related.
> > 
> > v3: Drop the reference count of the workaround bo for the context since
> > the context itself is owned by the screen (and so we can rely on the bo
> > existing for the lifetime of the context).
> 
> I like this idea, but I have questions and comments about the details.
> More questions than comments, really.
> 
> Today, with only Mesa changes, could we effectively do the same as
>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> by hacking Mesa to set no read/write domain when emitting relocs for the
> workaround_bo? (I admit I don't fully understand the kernel's domain
> tracking). If that does work, then it just would require a small hack to
> brw_emit_pipe_control_write().

However... There is a hack that requires the write hazard for gen6
pipecontrols unless you use the noreloc patches (hw limitation causing
pipecontrols to always use ggtt offsets, not the ppgtt you have normally).
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 18:05             ` Chris Wilson
@ 2017-01-26 23:40               ` Chad Versace
  0 siblings, 0 replies; 82+ messages in thread
From: Chad Versace @ 2017-01-26 23:40 UTC (permalink / raw)
  To: Chris Wilson, mesa-dev, Daniel Vetter, Kenneth Graunke, intel-gfx

On Thu 26 Jan 2017, Chris Wilson wrote:
> On Thu, Jan 26, 2017 at 09:39:51AM -0800, Chad Versace wrote:
> > On Thu 26 Jan 2017, Chris Wilson wrote:
> > > Since the workaround bo is used strictly as a write-only buffer, we need
> > > only allocate one per screen and use the same one from all contexts.
> > > 
> > > (The caveat here is during extension initialisation, where we write into
> > > and read back register values from the buffer, but that is performed only
> > > once for the first context - and barring synchronisation issues should not
> > > be a problem. Safer would be to move that also to the screen.)
> > > 
> > > v2: Give the workaround bo its own init function and don't piggy back
> > > intel_bufmgr_init() since it is not that related.
> > > 
> > > v3: Drop the reference count of the workaround bo for the context since
> > > the context itself is owned by the screen (and so we can rely on the bo
> > > existing for the lifetime of the context).
> > 
> > I like this idea, but I have questions and comments about the details.
> > More questions than comments, really.
> > 
> > Today, with only Mesa changes, could we effectively do the same as
> >   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> > by hacking Mesa to set no read/write domain when emitting relocs for the
> > workaround_bo? (I admit I don't fully understand the kernel's domain
> > tracking). If that does work, then it just would require a small hack to
> > brw_emit_pipe_control_write().
> 
> Yes, for anything that is totally scratch just not setting the write
> hazard is the same. For something like the seqno page where we have
> multiple engines that we do want to be preserved, not setting the write
> hazard had the consequence that the page could be lost under memory pressure
> or across resume. (As usual there are some details here: this part of the
> ABI had to be relaxed because userspace didn't have this flag.)
> But that doesn't sell many bananas.

Good. That's how I thought it worked.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-26 17:39           ` [Mesa-dev] " Chad Versace
  2017-01-26 18:05             ` Chris Wilson
  2017-01-26 18:46             ` Chris Wilson
@ 2017-01-27  0:01             ` Chad Versace
  2017-01-27 18:20               ` [Intel-gfx] " Emil Velikov
  2 siblings, 1 reply; 82+ messages in thread
From: Chad Versace @ 2017-01-27  0:01 UTC (permalink / raw)
  To: Chris Wilson, mesa-dev, Daniel Vetter, Kenneth Graunke, intel-gfx

On Thu 26 Jan 2017, Chad Versace wrote:
> On Thu 26 Jan 2017, Chris Wilson wrote:
> > Since the workaround bo is used strictly as a write-only buffer, we need
> > only allocate one per screen and use the same one from all contexts.
> > 
> > (The caveat here is during extension initialisation, where we write into
> > and read back register values from the buffer, but that is performed only
> > once for the first context - and barring synchronisation issues should not
> > be a problem. Safer would be to move that also to the screen.)
> > 
> > v2: Give the workaround bo its own init function and don't piggy back
> > intel_bufmgr_init() since it is not that related.
> > 
> > v3: Drop the reference count of the workaround bo for the context since
> > the context itself is owned by the screen (and so we can rely on the bo
> > existing for the lifetime of the context).
> 
> I like this idea, but I have questions and comments about the details.
> More questions than comments, really.
> 
> Today, with only Mesa changes, could we effectively do the same as
>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> by hacking Mesa to set no read/write domain when emitting relocs for the
> workaround_bo? (I admit I don't fully understand the kernel's domain
> tracking). If that does work, then it just would require a small hack to
> brw_emit_pipe_control_write().
> 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> > Cc: Martin Peres <martin.peres@linux.intel.com>
> > Cc: Chad Versace <chadversary@chromium.org>
> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>

> > diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c

> > +   /* We want to use this bo from any and all contexts, without undue
> > +    * write ordering between them. To prevent the kernel enforcing
> > +    * the order due to writes from different contexts, we disable
> > +    * the use of (the kernel's) implicit sync on this bo.
> > +    */
> > +   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);

> > +#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
> > +#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
> > +#endif

Until Mesa can actually disable the implicit sync, I think this patch
should be postponed. If it landed now, it may cause additional
unnecessary stalls between contexts. Chrome OS uses many contexts in
the same process, so if problems exist, they'll exhibit on CrOS. Perhaps
the extra stalls will be imperceptible, but I don't want to take the
risk.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing
  2017-01-26 10:32       ` Chris Wilson
  2017-01-26 10:58         ` [PATCH] i965: Share the workaround bo between all contexts Chris Wilson
@ 2017-01-27  0:07         ` Chad Versace
  1 sibling, 0 replies; 82+ messages in thread
From: Chad Versace @ 2017-01-27  0:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx, Daniel Vetter

On Thu 26 Jan 2017, Chris Wilson wrote:
> On Wed, Jan 25, 2017 at 12:38:32PM -0800, Chad Versace wrote:
> > On Mon 14 Nov 2016, Chris Wilson wrote:
> > > Userspace is faced with a dilemma. The kernel requires implicit fencing
> > > to manage resource usage (we always must wait for the GPU to finish
> > > before releasing its PTE) and for third parties. However, userspace may
> > > wish to avoid this serialisation if it is either using explicit fencing
> > > between parties and wants more fine-grained access to buffers (e.g. it
> > > may partition the buffer between uses and track fences on ranges rather
> > > than the implicit fences tracking the whole object). It follows that
> > > userspace needs a mechanism to avoid the kernel's serialisation on its
> > > implicit fences before execbuf execution.
> > > 
> > > The next question is whether this is an object, execbuf or context flag.
> > > Hybrid users (such as using explicit EGL_ANDROID_native_sync fencing on
> > > shared winsys buffers, but implicit fencing on internal surfaces)
> > > require a per-object level flag. Given that this flag needs to be
> > > set only once for the lifetime of the object, this reduces the convenience of
> > > having an execbuf or context level flag (and avoids having multiple
> > > pieces of uABI controlling the same feature).
> > > 
> > > Incorrect use of this flag will result in rendering corruption and GPU
> > > hangs - but will not result in use-after-free or similar resource
> > > tracking issues.
> > > 
> > > Serious caveat: write ordering is not strictly correct after setting
> > > this flag on a render target on multiple engines. This affects all
> > > subsequent GEM operations (execbuf, set-domain, pread) and shared
> > > dma-buf operations. A fix is possible - but costly (both in terms of
> > > further ABI changes and runtime overhead).
> > > 
> > > Testcase: igt/gem_exec_async
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
> > > ---
> > >  drivers/gpu/drm/i915/i915_drv.c            |  1 +
> > >  drivers/gpu/drm/i915/i915_gem_execbuffer.c |  3 +++
> > >  include/uapi/drm/i915_drm.h                | 29 ++++++++++++++++++++++++++++-
> > >  3 files changed, 32 insertions(+), 1 deletion(-)
> > 
> > I'm neutral about this patch. I believe patch 14/14 is useful with or
> > without this patch, and I want to see patch 14 land regardless of what
> > happens with this one.
> 
> I don't like the patch, it opens up a big wart in the GEM api (incorrect
> write tracking on GEM/dma-buf across multiple timelines). Otoh, being
> able to discard the implicit fence tracking seems to be an important
> feature request - if we go forward without it, we will then be lacking a
> feature that is common across other drivers and in particular seems to
> be commonplace in the Android ecosystem.

I agree. The explicit fence fds provide more benefit (that is, less
blocking and, in general, more *explicitness*) when implicit fencing is
disabled. Userspace should have some API to disable the implicit
fencing, and this patch seems like an ok approach. I certainly can think
of nothing better.

> Daniel, what's your feeling? One problem will be that the
> synchronisation issue may be hard to track down in future (proving that
> the cause of a stall is an avoidable implicit fence).
>  
> > I'm not opposed to this patch. It's just that I don't yet understand
> > exactly if Mesa's EGL/GL code could effectively use this feature for
> > Android winsys buffers. The amount of information loss between the
> > EGL/GL apis and the eventual execbuffer submission may prevent Mesa from
> > annotating the Android winsys buffers with this.  I'm unsure.  I'm still
> > thinking about it.
> > 
> > But, if Chris, or anyone, already has plans to use this somehow, perhaps
> > in the DDX, then don't let my hesitation block the patch.
> 
> Actually, the example I have would be for mesa. It can use this on its
> own scratch buffers to just discard writes and prevent ordering on
> a single scratch shared between contexts, and for its fence tracking using
> a single page for multiple rings.

Those use cases sound good to me. This patch is
Acked-by: Chad Versace <chadversary@chromium.org>
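
(As an aside, a sketch of the fence-tracking case Chris describes - one
write-only page shared by all rings, with implicit sync disabled so the
kernel never serialises the rings on it. The disable call is the libdrm
API discussed below in this thread; the surrounding names are made up:)

  #include <stdint.h>
  #include <intel_bufmgr.h> /* libdrm */

  /* One page shared by every ring; each ring owns a cacheline, so the
   * writes never overlap and ordering between rings does not matter. */
  static drm_intel_bo *create_seqno_page(drm_intel_bufmgr *bufmgr)
  {
          drm_intel_bo *bo =
                  drm_intel_bo_alloc(bufmgr, "seqno page", 4096, 4096);
          if (bo)
                  drm_intel_gem_bo_disable_implicit_sync(bo);
          return bo;
  }

  static uint32_t seqno_offset(unsigned int ring)
  {
          return ring * 64; /* one cacheline per ring */
  }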

* Re: [Intel-gfx] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-27  0:01             ` Chad Versace
@ 2017-01-27 18:20               ` Emil Velikov
  2017-01-27 18:30                 ` [Mesa-dev] " Chris Wilson
  0 siblings, 1 reply; 82+ messages in thread
From: Emil Velikov @ 2017-01-27 18:20 UTC (permalink / raw)
  To: Chad Versace, Chris Wilson, ML mesa-dev, Daniel Vetter,
	Kenneth Graunke, intel-gfx

On 27 January 2017 at 00:01, Chad Versace <chadversary@chromium.org> wrote:
> On Thu 26 Jan 2017, Chad Versace wrote:
>> On Thu 26 Jan 2017, Chris Wilson wrote:
>> > Since the workaround bo is used strictly as a write-only buffer, we need
>> > only allocate one per screen and use the same one from all contexts.
>> >
>> > (The caveat here is during extension initialisation, where we write into
>> > and read back register values from the buffer, but that is performed only
>> > once for the first context - and barring synchronisation issues should not
>> > be a problem. Safer would be to move that also to the screen.)
>> >
>> > v2: Give the workaround bo its own init function and don't piggyback on
>> > intel_bufmgr_init() since it is not that related.
>> >
>> > v3: Drop the reference count of the workaround bo for the context since
>> > the context itself is owned by the screen (and so we can rely on the bo
>> > existing for the lifetime of the context).
>>
>> I like this idea, but I have questions and comments about the details.
>> More questions than comments, really.
>>
>> Today, with only Mesa changes, could we effectively do the same as
>>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
>> by hacking Mesa to set no read/write domain when emitting relocs for the
>> workaround_bo? (I admit I don't fully understand the kernel's domain
>> tracking). If that does work, then it would just require a small hack to
>> brw_emit_pipe_control_write().
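
(For reference, "setting no read/write domain" would amount to emitting
the reloc with both domains zero through libdrm's existing API - an
untested sketch, with made-up surrounding names:

  drm_intel_bo_emit_reloc(batch_bo, reloc_offset,
                          screen->workaround_bo, 0 /* target offset */,
                          0 /* read_domains: none */,
                          0 /* write_domain: none */);

Whether the kernel then skips its implicit-fence bookkeeping is exactly
the open question raised here.)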
>>
>> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> > Cc: Kenneth Graunke <kenneth@whitecape.org>
>> > Cc: Martin Peres <martin.peres@linux.intel.com>
>> > Cc: Chad Versace <chadversary@chromium.org>
>> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>
>> > diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
>
>> > +   /* We want to use this bo from any and all contexts, without undue
>> > +    * write ordering between them. To prevent the kernel enforcing
>> > +    * the order due to writes from different contexts, we disable
>> > +    * the use of (the kernel's) implicit sync on this bo.
>> > +    */
>> > +   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
>
>> > +#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
>> > +#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
>> > +#endif
>
> Until Mesa can actually disable the implicit sync, I think this patch
> should be postponed. If it landed now, it may cause additional
> unnecessary stalls between contexts. Chrome OS uses many contexts in
> the same process, so if problems exist, they'll show up on CrOS. Perhaps
> the extra stalls will be imperceptible, but I don't want to take the
> risk.
Afaict the libdrm API is fine, although we're missing a
drm_intel_bufmgr_gem_can_disable_implicit_sync() call.
We'd want to check that and fall back when applicable?

Please don't use wrappers like this in mesa. Just roll a new libdrm
and bump the requirement.

Thanks
Emil
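
(For reference, the probe-and-fallback suggested above might look like
the sketch below. drm_intel_bufmgr_gem_can_disable_implicit_sync() is
the missing call just named, so it is an assumption rather than existing
API; the intel_screen field names follow the patch under discussion:)

  static void init_workaround_bo(struct intel_screen *screen)
  {
          screen->workaround_bo =
                  drm_intel_bo_alloc(screen->bufmgr, "pipe_control w/a",
                                     4096, 4096);
          if (!screen->workaround_bo)
                  return;

          if (drm_intel_bufmgr_gem_can_disable_implicit_sync(screen->bufmgr))
                  drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
          /* Otherwise keep implicit fencing: still correct, merely
           * serialising contexts on this write-only bo. */
  }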

* Re: [Mesa-dev] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-27 18:20               ` [Intel-gfx] " Emil Velikov
@ 2017-01-27 18:30                 ` Chris Wilson
  2017-01-27 18:37                   ` [Intel-gfx] " Emil Velikov
  0 siblings, 1 reply; 82+ messages in thread
From: Chris Wilson @ 2017-01-27 18:30 UTC (permalink / raw)
  To: Emil Velikov
  Cc: ML mesa-dev, Chad Versace, intel-gfx, Kenneth Graunke, Daniel Vetter

On Fri, Jan 27, 2017 at 06:20:46PM +0000, Emil Velikov wrote:
> On 27 January 2017 at 00:01, Chad Versace <chadversary@chromium.org> wrote:
> > On Thu 26 Jan 2017, Chad Versace wrote:
> >> On Thu 26 Jan 2017, Chris Wilson wrote:
> >> > Since the workaround bo is used strictly as a write-only buffer, we need
> >> > only allocate one per screen and use the same one from all contexts.
> >> >
> >> > (The caveat here is during extension initialisation, where we write into
> >> > and read back register values from the buffer, but that is performed only
> >> > once for the first context - and barring synchronisation issues should not
> >> > be a problem. Safer would be to move that also to the screen.)
> >> >
> >> > v2: Give the workaround bo its own init function and don't piggyback on
> >> > intel_bufmgr_init() since it is not that related.
> >> >
> >> > v3: Drop the reference count of the workaround bo for the context since
> >> > the context itself is owned by the screen (and so we can rely on the bo
> >> > existing for the lifetime of the context).
> >>
> >> I like this idea, but I have questions and comments about the details.
> >> More questions than comments, really.
> >>
> >> Today, with only Mesa changes, could we effectively do the same as
> >>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> >> by hacking Mesa to set no read/write domain when emitting relocs for the
> >> workaround_bo? (I admit I don't fully understand the kernel's domain
> >> tracking). If that does work, then it would just require a small hack to
> >> brw_emit_pipe_control_write().
> >>
> >> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >> > Cc: Kenneth Graunke <kenneth@whitecape.org>
> >> > Cc: Martin Peres <martin.peres@linux.intel.com>
> >> > Cc: Chad Versace <chadversary@chromium.org>
> >> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> >
> >> > diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
> >
> >> > +   /* We want to use this bo from any and all contexts, without undue
> >> > +    * write ordering between them. To prevent the kernel enforcing
> >> > +    * the order due to writes from different contexts, we disable
> >> > +    * the use of (the kernel's) implicit sync on this bo.
> >> > +    */
> >> > +   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
> >
> >> > +#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
> >> > +#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
> >> > +#endif
> >
> > Until Mesa can actually disable the implicit sync, I think this patch
> > should be postponed. If it landed now, it may cause additional
> > unnecessary stalls between contexts. Chrome OS uses many contexts in
> > the same process, so if problems exist, they'll show up on CrOS. Perhaps
> > the extra stalls will be imperceptible, but I don't want to take the
> > risk.
> Afaict the libdrm API is fine, although we're missing a
> drm_intel_bufmgr_gem_can_disable_implicit_sync() call.
> We'd want to check that and fall back when applicable?
> 
> Please don't use wrappers like this in mesa. Just roll a new libdrm
> and bump the requirement.

I was told that there was now a preference for a short-lived compat layer
because distros were unhappy.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [Intel-gfx] [PATCH] i965: Share the workaround bo between all contexts
  2017-01-27 18:30                 ` [Mesa-dev] " Chris Wilson
@ 2017-01-27 18:37                   ` Emil Velikov
  0 siblings, 0 replies; 82+ messages in thread
From: Emil Velikov @ 2017-01-27 18:37 UTC (permalink / raw)
  To: Chris Wilson, Emil Velikov, Chad Versace, ML mesa-dev,
	Daniel Vetter, Kenneth Graunke, intel-gfx

On 27 January 2017 at 18:30, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Fri, Jan 27, 2017 at 06:20:46PM +0000, Emil Velikov wrote:
>> On 27 January 2017 at 00:01, Chad Versace <chadversary@chromium.org> wrote:
>> > On Thu 26 Jan 2017, Chad Versace wrote:
>> >> On Thu 26 Jan 2017, Chris Wilson wrote:
>> >> > Since the workaround bo is used strictly as a write-only buffer, we need
>> >> > only allocate one per screen and use the same one from all contexts.
>> >> >
>> >> > (The caveat here is during extension initialisation, where we write into
>> >> > and read back register values from the buffer, but that is performed only
>> >> > once for the first context - and barring synchronisation issues should not
>> >> > be a problem. Safer would be to move that also to the screen.)
>> >> >
>> >> > v2: Give the workaround bo its own init function and don't piggyback on
>> >> > intel_bufmgr_init() since it is not that related.
>> >> >
>> >> > v3: Drop the reference count of the workaround bo for the context since
>> >> > the context itself is owned by the screen (and so we can rely on the bo
>> >> > existing for the lifetime of the context).
>> >>
>> >> I like this idea, but I have questions and comments about the details.
>> >> More questions than comments, really.
>> >>
>> >> Today, with only Mesa changes, could we effectively do the same as
>> >>   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
>> >> by hacking Mesa to set no read/write domain when emitting relocs for the
>> >> workaround_bo? (I admit I don't fully understand the kernel's domain
>> >> tracking). If that does work, then it would just require a small hack to
>> >> brw_emit_pipe_control_write().
>> >>
>> >> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>> >> > Cc: Kenneth Graunke <kenneth@whitecape.org>
>> >> > Cc: Martin Peres <martin.peres@linux.intel.com>
>> >> > Cc: Chad Versace <chadversary@chromium.org>
>> >> > Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
>> >
>> >> > diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
>> >
>> >> > +   /* We want to use this bo from any and all contexts, without undue
>> >> > +    * write ordering between them. To prevent the kernel enforcing
>> >> > +    * the order due to writes from different contexts, we disable
>> >> > +    * the use of (the kernel's) implicit sync on this bo.
>> >> > +    */
>> >> > +   drm_intel_gem_bo_disable_implicit_sync(screen->workaround_bo);
>> >
>> >> > +#ifndef HAVE_DRM_INTEL_GEM_BO_DISABLE_IMPLICIT_SYNC
>> >> > +#define drm_intel_gem_bo_disable_implicit_sync(BO) do { } while (0)
>> >> > +#endif
>> >
>> > Until Mesa can actually disable the implicit sync, I think this patch
>> > should be postponed. If it landed now, it may cause additional
>> > unnecessary stalls between contexts. Chrome OS uses many contexts in
>> > the same process, so if problems exist, they'll show up on CrOS. Perhaps
>> > the extra stalls will be imperceptible, but I don't want to take the
>> > risk.
>> Afaict the libdrm API is fine, although we're missing a
>> drm_intel_bufmgr_gem_can_disable_implicit_sync() call.
>> We'd want to check that and fall back when applicable?
>>
>> Please don't use wrappers like this in mesa. Just roll a new libdrm
>> and bump the requirement.
>
> I was told that there was now a preference for a short-lived compat layer
> because distros were unhappy.

As long as there's a libdrm release, distros should be fine. Obviously
there's always the case of someone being unhappy - in one example a
distro decided to freeze their libdrm package on the exact version
that badly broke nouveau. Ilia can tell you how many times he had to
repeat the same suggestion - downgrade, or update to a local package
:-\

-Emil

end of thread

Thread overview: 82+ messages
2016-11-07 13:59 Trivial scheduler, take 2 Chris Wilson
2016-11-07 13:59 ` [PATCH v2 01/11] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
2016-11-08  7:43   ` Joonas Lahtinen
2016-11-08  8:50     ` Chris Wilson
2016-11-07 13:59 ` [PATCH v2 02/11] drm/i915: Split request submit/execute phase into two Chris Wilson
2016-11-08  9:06   ` Joonas Lahtinen
2016-11-07 13:59 ` [PATCH v2 03/11] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
2016-11-10 10:43   ` Tvrtko Ursulin
2016-11-10 11:11     ` Chris Wilson
2016-11-10 11:51       ` Tvrtko Ursulin
2016-11-10 14:43         ` Chris Wilson
2016-11-10 11:23     ` [PATCH v3] " Chris Wilson
2016-11-07 13:59 ` [PATCH v2 04/11] drm/i915: Remove engine->execlist_lock Chris Wilson
2016-11-07 13:59 ` [PATCH v2 05/11] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
2016-11-07 13:59 ` [PATCH v2 06/11] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
2016-11-08 12:20   ` Chris Wilson
2016-11-10 10:44     ` Tvrtko Ursulin
2016-11-10 10:55       ` Chris Wilson
2016-11-10 11:54         ` Tvrtko Ursulin
2016-11-10 12:10           ` Chris Wilson
2016-11-10 14:45   ` Tvrtko Ursulin
2016-11-10 15:01     ` Chris Wilson
2016-11-10 15:36       ` Tvrtko Ursulin
2016-11-10 15:55         ` Chris Wilson
2016-11-07 13:59 ` [PATCH v2 07/11] drm/i915/scheduler: Boost priorities for flips Chris Wilson
2016-11-10 10:52   ` Tvrtko Ursulin
2016-11-07 13:59 ` [PATCH v2 08/11] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
2016-11-07 13:59 ` [PATCH v2 09/11] drm/i915/scheduler: Support user-defined priorities Chris Wilson
2016-11-10 13:02   ` Tvrtko Ursulin
2016-11-10 13:10     ` Chris Wilson
2016-11-07 13:59 ` [PATCH v2 10/11] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
2016-11-07 13:59 ` [PATCH v2 11/11] drm/i915: Support explicit fencing for execbuf Chris Wilson
2016-11-07 15:18 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines Patchwork
2016-11-10 11:45 ` ✓ Fi.CI.BAT: success for series starting with [v2,01/11] drm/i915: Create distinct lockclasses for execution vs user timelines (rev2) Patchwork
2016-11-10 12:04   ` Saarinen, Jani
2016-11-14  8:56 ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Chris Wilson
2016-11-14  8:56   ` [PATCH v3 02/14] drm/i915: Create distinct lockclasses for execution vs user timelines Chris Wilson
2016-11-14  8:56   ` [PATCH v3 03/14] drm/i915: Split request submit/execute phase into two Chris Wilson
2016-11-14  8:56   ` [PATCH v3 04/14] drm/i915: Defer transfer onto execution timeline to actual hw submission Chris Wilson
2016-11-14 10:59     ` Tvrtko Ursulin
2016-11-14  8:56   ` [PATCH v3 05/14] drm/i915: Remove engine->execlist_lock Chris Wilson
2016-11-14  8:56   ` [PATCH v3 06/14] drm/i915/scheduler: Signal the arrival of a new request Chris Wilson
2016-11-14  8:56   ` [PATCH v3 07/14] drm/i915/scheduler: Record all dependencies upon request construction Chris Wilson
2016-11-14 11:09     ` Tvrtko Ursulin
2016-11-14  8:56   ` [PATCH v3 08/14] drm/i915/scheduler: Execute requests in order of priorities Chris Wilson
2016-11-14 11:15     ` Tvrtko Ursulin
2016-11-14 11:41       ` Chris Wilson
2016-11-14 11:48         ` Tvrtko Ursulin
2016-11-14 14:25           ` Chris Wilson
2016-11-14  8:56   ` [PATCH v3 09/14] drm/i915: Store the execution priority on the context Chris Wilson
2016-11-14 11:16     ` Tvrtko Ursulin
2016-11-14  8:56   ` [PATCH v3 10/14] drm/i915/scheduler: Boost priorities for flips Chris Wilson
2016-11-14  8:57   ` [PATCH v3 11/14] HACK drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
2016-11-14 11:31     ` Tvrtko Ursulin
2016-11-14 14:40       ` Chris Wilson
2016-12-01 10:45     ` Tvrtko Ursulin
2016-12-01 11:18       ` Chris Wilson
2016-12-01 12:45         ` Tvrtko Ursulin
2016-12-01 13:01           ` Chris Wilson
2016-11-14  8:57   ` [PATCH v3 12/14] drm/i915/scheduler: Support user-defined priorities Chris Wilson
2016-11-14 11:32     ` Tvrtko Ursulin
2016-11-14  8:57   ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chris Wilson
2017-01-25 20:38     ` Chad Versace
2017-01-26 10:32       ` Chris Wilson
2017-01-26 10:58         ` [PATCH] i965: Share the workaround bo between all contexts Chris Wilson
2017-01-26 17:39           ` [Mesa-dev] " Chad Versace
2017-01-26 18:05             ` Chris Wilson
2017-01-26 23:40               ` Chad Versace
2017-01-26 18:46             ` Chris Wilson
2017-01-27  0:01             ` Chad Versace
2017-01-27 18:20               ` [Intel-gfx] " Emil Velikov
2017-01-27 18:30                 ` [Mesa-dev] " Chris Wilson
2017-01-27 18:37                   ` [Intel-gfx] " Emil Velikov
2017-01-27  0:07         ` [PATCH v3 13/14] drm/i915: Enable userspace to opt-out of implicit fencing Chad Versace
2016-11-14  8:57   ` [PATCH v3 14/14] drm/i915: Support explicit fencing for execbuf Chris Wilson
2016-11-14 22:29     ` Rafael Antognolli
2017-01-25 20:27     ` Chad Versace
2016-11-14  9:01   ` [PATCH v3 01/14] drm/i915: Give each sw_fence its own lockclass Tvrtko Ursulin
2016-11-14  9:05     ` Chris Wilson
2016-11-14 10:57   ` Tvrtko Ursulin
2016-11-14 14:48   ` Joonas Lahtinen
2016-11-14 15:13     ` Chris Wilson
