* [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc
@ 2017-02-01 10:24 Chris Wilson
  2017-02-01 12:09 ` Tvrtko Ursulin
                   ` (4 more replies)
  0 siblings, 5 replies; 14+ messages in thread
From: Chris Wilson @ 2017-02-01 10:24 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least
v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c            |   4 +-
 drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
 3 files changed, 106 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 8ced9e26f075..68c32da28094 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -25,6 +25,8 @@
 #include "i915_drv.h"
 #include "intel_uc.h"
 
+#include <trace/events/dma_fence.h>
+
 /**
  * DOC: GuC-based command submission
  *
@@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
+	 * detects an ordering issue between the fence lockclass and the
+	 * global_timeline. This circular dependency can only occur via 2
+	 * different fences (but same fence lockclass), so we use the nesting
+	 * annotation here to prevent the warn, equivalent to the nesting
+	 * inside i915_gem_request_submit() for when we also enable the
+	 * signaler.
+	 */
+
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;
+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+	trace_dma_fence_enable_signal(&rq->fence);
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			port[0].request = port[1].request;
+			port[1].request = NULL;
+			rq = port[0].request; /* advance to the new head */
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		struct drm_i915_gem_request *rq;
+		unsigned long flags;
 
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
+		spin_lock_irqsave(&engine->timeline->lock, flags);
 		list_for_each_entry(rq, &engine->timeline->requests, link) {
+			spin_lock(&client->wq_lock);
 			client->wq_rsvd += sizeof(struct guc_wq_item);
+			spin_unlock(&client->wq_lock);
+
 			__i915_guc_submit(rq);
 		}
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0ff75f2282fa..e0cec254faa5 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1350,8 +1350,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		tasklet_schedule(&engine->irq_tasklet);
 		notify_ring(engine);
+	}
 
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
 		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 0e99d53d5523..1b7fc0ffa7ad 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1272,7 +1272,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	/* After a GPU reset, we may have requests to replay */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-	if (!execlists_elsp_idle(engine)) {
+	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
 		engine->execlist_port[0].count = 0;
 		engine->execlist_port[1].count = 0;
 		execlists_submit_ports(engine);
@@ -1340,9 +1340,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
 	if (request->ctx != port[0].request->ctx) {
-- 
2.11.0


* Re: [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 10:24 [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
@ 2017-02-01 12:09 ` Tvrtko Ursulin
  2017-02-01 12:46   ` Chris Wilson
  2017-02-01 12:54 ` ✓ Fi.CI.BAT: success for " Patchwork
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-02-01 12:09 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 01/02/2017 10:24, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue; however, it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).

Not after every request, since it does have two virtual ports, and it 
does even a bit better with request merging. Just saying, so it sounds 
less dramatic for people reading commit messages.

> v2: Drop hack status - though iirc there is still a lockdep inversion
> between fence and engine->timeline->lock (which is impossible as the
> nesting only occurs on different fences - hopefully just requires some
> judicious lockdep annotation)
> v3: Apply lockdep nesting to enabling signaling on the request, using
> the pattern we already have in __i915_gem_request_submit();
> v4: Replaying requests after a hang also now needs the timeline
> spinlock, to disable the interrupts at least
> v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_irq.c            |   4 +-
>  drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
>  3 files changed, 106 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 8ced9e26f075..68c32da28094 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -25,6 +25,8 @@
>  #include "i915_drv.h"
>  #include "intel_uc.h"
>
> +#include <trace/events/dma_fence.h>
> +
>  /**
>   * DOC: GuC-based command submission
>   *
> @@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
>  	freespace -= client->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		client->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>
>  	return ret;
>  }
> @@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	client->wq_rsvd -= wqi_size;
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
>
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>  	__i915_guc_submit(rq);
>  }
>
> +static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> +{
> +	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> +	 * detects an ordering issue between the fence lockclass and the
> +	 * global_timeline. This circular dependency can only occur via 2
> +	 * different fences (but same fence lockclass), so we use the nesting
> +	 * annotation here to prevent the warn, equivalent to the nesting
> +	 * inside i915_gem_request_submit() for when we also enable the
> +	 * signaler.
> +	 */
> +
> +	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> +			     &rq->fence.flags))
> +		return;
> +
> +	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
> +	trace_dma_fence_enable_signal(&rq->fence);
> +
> +	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
> +	intel_engine_enable_signaling(rq);
> +	spin_unlock(&rq->lock);
> +}
> +
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			nested_enable_signaling(last);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		nested_enable_signaling(last);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			port[0].request = port[1].request;
> +			port[1].request = NULL;
> +			rq = port[0].request; /* advance to the new head */
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		struct drm_i915_gem_request *rq;
> +		unsigned long flags;
>
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> +		spin_lock_irqsave(&engine->timeline->lock, flags);
>  		list_for_each_entry(rq, &engine->timeline->requests, link) {
> +			spin_lock(&client->wq_lock);
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> +			spin_unlock(&client->wq_lock);
> +
>  			__i915_guc_submit(rq);
>  		}
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 0ff75f2282fa..e0cec254faa5 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1350,8 +1350,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		tasklet_schedule(&engine->irq_tasklet);
>  		notify_ring(engine);
> +	}

Would this be better:

if (iir & (CTX | USR)) {
	set_bit();
	tasklet_hi_schedule();
	
	if (iir & USR) {
		notify_ring();
	}
}

?

>
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
>  		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 0e99d53d5523..1b7fc0ffa7ad 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1272,7 +1272,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>
>  	/* After a GPU reset, we may have requests to replay */
>  	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -	if (!execlists_elsp_idle(engine)) {
> +	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
>  		engine->execlist_port[0].count = 0;
>  		engine->execlist_port[1].count = 0;
>  		execlists_submit_ports(engine);
> @@ -1340,9 +1340,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));

Should we put this write under the i915.enable_guc_submission check, 
perhaps? Maybe it makes no difference since this is after the reset, but 
I don't know; perhaps it is more logical.

>  	if (request->ctx != port[0].request->ctx) {
>

Regards,

Tvrtko

* Re: [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 12:09 ` Tvrtko Ursulin
@ 2017-02-01 12:46   ` Chris Wilson
  2017-02-01 13:15     ` Tvrtko Ursulin
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-02-01 12:46 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Wed, Feb 01, 2017 at 12:09:40PM +0000, Tvrtko Ursulin wrote:
> 
> On 01/02/2017 10:24, Chris Wilson wrote:
> >This emulates execlists on top of the GuC in order to defer submission of
> >requests to the hardware. This deferral allows time for high priority
> >requests to gazump their way to the head of the queue; however, it nerfs
> >the GuC by converting it back into a simple execlist (where the CPU has
> >to wake up after every request to feed new commands into the GuC).
> 
> Not after every request, since it does have two virtual ports, and it
> does even a bit better with request merging. Just saying, so it sounds
> less dramatic for people reading commit messages.

We still wake up after the first virtual port is drained, with the hope
that the second is full to prevent the stall. What you meant is that we
coalesce requests from the same context into a port, so not every
request need result in an extra interrupt. Almost every request.
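
As a toy model of that port-filling rule (plain userspace C, illustration
only, not driver code; only the context-change/two-port logic is kept):

        #include <stdio.h>

        struct req { int ctx; };

        int main(void)
        {
                /* Pending queue: three requests of ctx 1, two of ctx 2,
                 * one of ctx 3.
                 */
                struct req q[] = { {1}, {1}, {1}, {2}, {2}, {3} };
                int nports = 2, port = 0, last_ctx = -1, submitted = 0;
                unsigned int i;

                for (i = 0; i < sizeof(q) / sizeof(q[0]); i++) {
                        /* A context change closes the current port... */
                        if (last_ctx != -1 && q[i].ctx != last_ctx &&
                            ++port == nports)
                                break; /* ...both ports busy: wait for irq */
                        last_ctx = q[i].ctx;
                        submitted++; /* coalesced into the current port */
                }

                printf("submitted %d, %d left for the next dequeue\n",
                       submitted,
                       (int)(sizeof(q) / sizeof(q[0])) - submitted);
                return 0;
        }

That submits five of the six requests (ctx 1 fills port 0, ctx 2 fills
port 1) and leaves ctx 3 queued until the next interrupt wakes us again.
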
> >diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> >index 0ff75f2282fa..e0cec254faa5 100644
> >--- a/drivers/gpu/drm/i915/i915_irq.c
> >+++ b/drivers/gpu/drm/i915/i915_irq.c
> >@@ -1350,8 +1350,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
> > static __always_inline void
> > gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
> > {
> >-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> >+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> >+		tasklet_schedule(&engine->irq_tasklet);
> > 		notify_ring(engine);
> >+	}
> 
> Would this be better:
> 
> if (iir & (CTX | USR)) {
> 	set_bit();

No because ^.

> 	tasklet_hi_schedule();
> 	
> 	if (iir & USR) {
> 		notify_ring();
> 	}
> }
> 
> ?

If we set the context-switch bit too early (and process the
user-interrupt and the context-switch as two separate passes through the
tasklet), we encounter the error from before that the CSB register may
be undefined. Possibly not since the user interrupt should mean the ring
is powered up (and so the register restored from the power context, one
hopes!), but there is no reason for us to read back the registers until
we see the interrupt telling us they have been updated.

Not sure of the best way to write that simply, so I kept the patch
clean.

        if (!(iir & ((GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT) << test_shift)))
                return;

        if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
                set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);

        tasklet_hi_schedule(&engine->irq_tasklet);

        if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
                notify_ring(engine);

add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-85 (-85)
function                                     old     new   delta
gen8_gt_irq_handler                          756     671     -85
Total: Before=1093982, After=1093897, chg -0.01%

Ok, gcc likes that a lot more.

> > 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
> > 		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> >diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >index 0e99d53d5523..1b7fc0ffa7ad 100644
> >--- a/drivers/gpu/drm/i915/intel_lrc.c
> >+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >@@ -1272,7 +1272,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
> >
> > 	/* After a GPU reset, we may have requests to replay */
> > 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> >-	if (!execlists_elsp_idle(engine)) {
> >+	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
> > 		engine->execlist_port[0].count = 0;
> > 		engine->execlist_port[1].count = 0;
> > 		execlists_submit_ports(engine);
> >@@ -1340,9 +1340,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
> > 	request->ring->last_retired_head = -1;
> > 	intel_ring_update_space(request->ring);
> >
> >-	if (i915.enable_guc_submission)
> >-		return;
> >-
> > 	/* Catch up with any missed context-switch interrupts */
> > 	I915_WRITE(RING_CONTEXT_STATUS_PTR(engine), _MASKED_FIELD(0xffff, 0));
> 
> Should we put this write under the i915.enable_guc_submission check,
> perhaps? Maybe it makes no difference since this is after the reset,
> but I don't know; perhaps it is more logical.

Perhaps. Although now with the set_bit(CTX_SWITCH), we should be able to
remove this line entirely.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 10:24 [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
  2017-02-01 12:09 ` Tvrtko Ursulin
@ 2017-02-01 12:54 ` Patchwork
  2017-02-01 15:31 ` [PATCH v6] " Chris Wilson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2017-02-01 12:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/scheduler: emulate a scheduler for guc
URL   : https://patchwork.freedesktop.org/series/18909/
State : success

== Summary ==

Series 18909v1 drm/i915/scheduler: emulate a scheduler for guc
https://patchwork.freedesktop.org/api/1.0/series/18909/revisions/1/mbox/


fi-bdw-5557u     total:247  pass:233  dwarn:0   dfail:0   fail:0   skip:14 
fi-bsw-n3050     total:247  pass:208  dwarn:0   dfail:0   fail:0   skip:39 
fi-bxt-j4205     total:247  pass:225  dwarn:0   dfail:0   fail:0   skip:22 
fi-bxt-t5700     total:78   pass:65   dwarn:0   dfail:0   fail:0   skip:12 
fi-byt-j1900     total:247  pass:220  dwarn:0   dfail:0   fail:0   skip:27 
fi-byt-n2820     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-hsw-4770      total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-hsw-4770r     total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-ivb-3520m     total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-ivb-3770      total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-kbl-7500u     total:247  pass:224  dwarn:0   dfail:0   fail:2   skip:21 
fi-skl-6260u     total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-skl-6700hq    total:247  pass:227  dwarn:0   dfail:0   fail:0   skip:20 
fi-skl-6700k     total:247  pass:222  dwarn:4   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-snb-2520m     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-snb-2600      total:247  pass:215  dwarn:0   dfail:0   fail:0   skip:32 

eb9b7b42023edc1b5849d1ff3bef490b492067a3 drm-tip: 2017y-02m-01d-11h-09m-17s UTC integration manifest
27530f8 drm/i915/scheduler: emulate a scheduler for guc

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3661/

* Re: [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 12:46   ` Chris Wilson
@ 2017-02-01 13:15     ` Tvrtko Ursulin
  2017-02-01 13:39       ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-02-01 13:15 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 01/02/2017 12:46, Chris Wilson wrote:
> On Wed, Feb 01, 2017 at 12:09:40PM +0000, Tvrtko Ursulin wrote:
>>
>> On 01/02/2017 10:24, Chris Wilson wrote:
>>> This emulates execlists on top of the GuC in order to defer submission of
>>> requests to the hardware. This deferral allows time for high priority
>>> requests to gazump their way to the head of the queue; however, it nerfs
>>> the GuC by converting it back into a simple execlist (where the CPU has
>>> to wake up after every request to feed new commands into the GuC).
>>
>> Not after every request, since it does have two virtual ports, and it
>> does even a bit better with request merging. Just saying, so it sounds
>> less dramatic for people reading commit messages.
>
> We still wake up after the first virtual port is drained, with the hope
> that the second is full to prevent the stall. What you meant is that we
> coalesce requests from the same context into a port, so not every
> request need result in an extra interrupt. Almost every request.

I was just after a milder description so it doesn't sound like we never 
submit more than one request to the GuC.

>>> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
>>> index 0ff75f2282fa..e0cec254faa5 100644
>>> --- a/drivers/gpu/drm/i915/i915_irq.c
>>> +++ b/drivers/gpu/drm/i915/i915_irq.c
>>> @@ -1350,8 +1350,10 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>>> static __always_inline void
>>> gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>>> {
>>> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>>> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
>>> +		tasklet_schedule(&engine->irq_tasklet);
>>> 		notify_ring(engine);
>>> +	}
>>
>> Would this be better:
>>
>> if (iir & (CTX | USR)) {
>> 	set_bit();
>
> No because ^.
>
>> 	tasklet_hi_schedule();
>> 	
>> 	if (iir & USR) {
>> 		notify_ring();
>> 	}
>> }
>>
>> ?
>
> If we set the context-switch bit too early (and process the
> user-interrupt and the context-switch as two separate passes through the
> tasklet), we encounter the error from before that the CSB register may
> be undefined. Possibly not since the user interrupt should mean the ring
> is powered up (and so the register restored from the power context, one
> hopes!), but there is no reason for us to read back the registers until
> we see the interrupt telling us they have been updated.
>
> Not sure of the best way to write that simply, so I kept the patch
> clean.
>
>         if (!(iir & ((GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT) << test_shift)))
>                 return;
>
>         if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
>                 set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
>
>         tasklet_hi_schedule(&engine->irq_tasklet);
>
>         if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
>                 notify_ring(engine);
>
> add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-85 (-85)
> function                                     old     new   delta
> gen8_gt_irq_handler                          756     671     -85
> Total: Before=1093982, After=1093897, chg -0.01%
>
> Ok, gcc likes that a lot more.

What's this add/remove grow/shrink up/down business? Something from some 
standard tool?

Back to the topic - should we be concerned here that in execlist mode we 
might wake up the tasklet prematurely? If it manages to run and exit in 
the window between the user interrupt and the context interrupt it would 
just waste cycles. I can easily see ~150us between the two here. What do 
you think?

Regards,

Tvrtko

* Re: [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 13:15     ` Tvrtko Ursulin
@ 2017-02-01 13:39       ` Chris Wilson
  0 siblings, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2017-02-01 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

On Wed, Feb 01, 2017 at 01:15:49PM +0000, Tvrtko Ursulin wrote:
> 
> On 01/02/2017 12:46, Chris Wilson wrote:
> >If we set the context-switch bit too early (and process the
> >user-interrupt and the context-switch as two separate passes through the
> >tasklet), we encounter the error from before that the CSB register may
> >be undefined. Possibly not since the user interrupt should mean the ring
> >is powered up (and so the register restored from the power context, one
> >hopes!), but there is no reason for us to read back the registers until
> >we see the interrupt telling us they have been updated.
> >
> >Not sure of the best way to write that simply, so I kept the patch
> >clean.
> >
> >        if (!(iir & ((GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT) << test_shift)))
> >                return;
> >
> >        if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
> >                set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> >
> >        tasklet_hi_schedule(&engine->irq_tasklet);
> >
> >        if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> >                notify_ring(engine);
> >
> >add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-85 (-85)
> >function                                     old     new   delta
> >gen8_gt_irq_handler                          756     671     -85
> >Total: Before=1093982, After=1093897, chg -0.01%
> >
> >Ok, gcc likes that a lot more.
> 
> What's this add/remove grow/shrink up/down business? Something from
> some standard tool?

That was ./scripts/bloat-o-meter
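
For reference, it takes two builds of the same object and diffs their
symbol sizes, e.g. (file names here purely illustrative):

        ./scripts/bloat-o-meter i915_irq.o.old i915_irq.o.new

The add/remove grow/shrink summary above is its standard output.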

> Back to the topic - should we be concerned here that in execlist
> mode we might wake up the tasklet prematurely? If it manages to run
> and exit in the window between the user interrupt and the context
> interrupt it would just waste cycles. I can easily see ~150us
> between the two here. What do you think?

Been worrying, but it's been below the level of sensitivity whilst using
my bdw. If some person did have some superfine tools to probe
latency/throughput ;) and it highlighted this as a problem, I was thinking
that we would simply split the gen8/gen9 interrupt (GT?) handling.
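
Hypothetically, the split might look something like this (a sketch only,
not part of any posted patch; guc_cs_irq_handler is an invented name):

        static __always_inline void
        guc_cs_irq_handler(struct intel_engine_cs *engine, u32 iir,
                           int test_shift)
        {
                /* Under guc submission every completed request can free up
                 * a port, so user interrupts must also kick the tasklet.
                 */
                if (iir & ((GT_RENDER_USER_INTERRUPT |
                            GT_CONTEXT_SWITCH_INTERRUPT) << test_shift))
                        tasklet_hi_schedule(&engine->irq_tasklet);

                if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
                        notify_ring(engine);
        }

leaving gen8_cs_irq_handler() with the execlists-only behaviour.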
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* [PATCH v6] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 10:24 [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
  2017-02-01 12:09 ` Tvrtko Ursulin
  2017-02-01 12:54 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2017-02-01 15:31 ` Chris Wilson
  2017-02-01 16:15   ` [PATCH v7] " Chris Wilson
  2017-02-01 16:54 ` ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev3) Patchwork
  2017-02-02 13:54 ` ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev4) Patchwork
  4 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-02-01 15:31 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least
v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
v6: Reorder interrupt checking for a happier gcc.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c            |  13 ++--
 drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
 3 files changed, 111 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 8ced9e26f075..68c32da28094 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -25,6 +25,8 @@
 #include "i915_drv.h"
 #include "intel_uc.h"
 
+#include <trace/events/dma_fence.h>
+
 /**
  * DOC: GuC-based command submission
  *
@@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
+	 * detects an ordering issue between the fence lockclass and the
+	 * global_timeline. This circular dependency can only occur via 2
+	 * different fences (but same fence lockclass), so we use the nesting
+	 * annotation here to prevent the warn, equivalent to the nesting
+	 * inside i915_gem_request_submit() for when we also enable the
+	 * signaler.
+	 */
+
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;
+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+	trace_dma_fence_enable_signal(&rq->fence);
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			port[0].request = port[1].request;
+			port[1].request = NULL;
+			rq = port[0].request; /* advance to the new head */
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		struct drm_i915_gem_request *rq;
+		unsigned long flags;
 
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
+		spin_lock_irqsave(&engine->timeline->lock, flags);
 		list_for_each_entry(rq, &engine->timeline->requests, link) {
+			spin_lock(&client->wq_lock);
 			client->wq_rsvd += sizeof(struct guc_wq_item);
+			spin_unlock(&client->wq_lock);
+
 			__i915_guc_submit(rq);
 		}
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0ff75f2282fa..0291a2c7264e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1350,13 +1350,16 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
-		notify_ring(engine);
+	if (!(iir & ((GT_RENDER_USER_INTERRUPT | GT_CONTEXT_SWITCH_INTERRUPT) << test_shift)))
+		return;
 
-	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
+	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		tasklet_hi_schedule(&engine->irq_tasklet);
-	}
+
+	tasklet_hi_schedule(&engine->irq_tasklet);
+
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
+		notify_ring(engine);
 }
 
 static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 44a92ea464ba..c81d86afb52f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1278,7 +1278,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	/* After a GPU reset, we may have requests to replay */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-	if (!execlists_elsp_idle(engine)) {
+	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
 		engine->execlist_port[0].count = 0;
 		engine->execlist_port[1].count = 0;
 		execlists_submit_ports(engine);
@@ -1345,9 +1345,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	if (request->ctx != port[0].request->ctx) {
 		i915_gem_request_put(port[0].request);
-- 
2.11.0


* [PATCH v7] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 15:31 ` [PATCH v6] " Chris Wilson
@ 2017-02-01 16:15   ` Chris Wilson
  2017-02-01 16:19     ` Tvrtko Ursulin
  2017-02-02 10:35     ` [PATCH v8] " Chris Wilson
  0 siblings, 2 replies; 14+ messages in thread
From: Chris Wilson @ 2017-02-01 16:15 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue; however, it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least
v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
v6: Reorder interrupt checking for a happier gcc.
v7: Only signal the tasklet after a user-interrupt if using guc scheduling

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c            |  13 +++-
 drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
 3 files changed, 113 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 8ced9e26f075..68c32da28094 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -25,6 +25,8 @@
 #include "i915_drv.h"
 #include "intel_uc.h"
 
+#include <trace/events/dma_fence.h>
+
 /**
  * DOC: GuC-based command submission
  *
@@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
+	 * detects an ordering issue between the fence lockclass and the
+	 * global_timeline. This circular dependency can only occur via 2
+	 * different fences (but same fence lockclass), so we use the nesting
+	 * annotation here to prevent the warn, equivalent to the nesting
+	 * inside i915_gem_request_submit() for when we also enable the
+	 * signaler.
+	 */
+
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;
+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+	trace_dma_fence_enable_signal(&rq->fence);
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			port[0].request = port[1].request;
+			port[1].request = NULL;
+			rq = port[0].request; /* advance to the new head */
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		struct drm_i915_gem_request *rq;
+		unsigned long flags;
 
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
+		spin_lock_irqsave(&engine->timeline->lock, flags);
 		list_for_each_entry(rq, &engine->timeline->requests, link) {
+			spin_lock(&client->wq_lock);
 			client->wq_rsvd += sizeof(struct guc_wq_item);
+			spin_unlock(&client->wq_lock);
+
 			__i915_guc_submit(rq);
 		}
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 0ff75f2282fa..e153344fcb1a 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1350,13 +1350,20 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
-		notify_ring(engine);
+	bool tasklet = false;
 
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
 		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		tasklet_hi_schedule(&engine->irq_tasklet);
+		tasklet = true;
 	}
+
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		notify_ring(engine);
+		tasklet |= i915.enable_guc_submission;
+	}
+
+	if (tasklet)
+		tasklet_hi_schedule(&engine->irq_tasklet);
 }
 
 static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 44a92ea464ba..c81d86afb52f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1278,7 +1278,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	/* After a GPU reset, we may have requests to replay */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-	if (!execlists_elsp_idle(engine)) {
+	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
 		engine->execlist_port[0].count = 0;
 		engine->execlist_port[1].count = 0;
 		execlists_submit_ports(engine);
@@ -1345,9 +1345,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	if (request->ctx != port[0].request->ctx) {
 		i915_gem_request_put(port[0].request);
-- 
2.11.0


* Re: [PATCH v7] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 16:15   ` [PATCH v7] " Chris Wilson
@ 2017-02-01 16:19     ` Tvrtko Ursulin
  2017-02-02 10:35     ` [PATCH v8] " Chris Wilson
  1 sibling, 0 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2017-02-01 16:19 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 01/02/2017 16:15, Chris Wilson wrote:
> This emulates execlists on top of the GuC in order to defer submission of
> requests to the hardware. This deferral allows time for high priority
> requests to gazump their way to the head of the queue; however, it nerfs
> the GuC by converting it back into a simple execlist (where the CPU has
> to wake up after every request to feed new commands into the GuC).
>
> v2: Drop hack status - though iirc there is still a lockdep inversion
> between fence and engine->timeline->lock (which is impossible as the
> nesting only occurs on different fences - hopefully just requires some
> judicious lockdep annotation)
> v3: Apply lockdep nesting to enabling signaling on the request, using
> the pattern we already have in __i915_gem_request_submit();
> v4: Replaying requests after a hang also now needs the timeline
> spinlock, to disable the interrupts at least
> v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
> v6: Reorder interrupt checking for a happier gcc.
> v7: Only signal the tasklet after a user-interrupt if using guc scheduling
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
> ---
>  drivers/gpu/drm/i915/i915_guc_submission.c | 109 +++++++++++++++++++++++++++--
>  drivers/gpu/drm/i915/i915_irq.c            |  13 +++-
>  drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
>  3 files changed, 113 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
> index 8ced9e26f075..68c32da28094 100644
> --- a/drivers/gpu/drm/i915/i915_guc_submission.c
> +++ b/drivers/gpu/drm/i915/i915_guc_submission.c
> @@ -25,6 +25,8 @@
>  #include "i915_drv.h"
>  #include "intel_uc.h"
>
> +#include <trace/events/dma_fence.h>
> +
>  /**
>   * DOC: GuC-based command submission
>   *
> @@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  	u32 freespace;
>  	int ret;
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
>  	freespace -= client->wq_rsvd;
>  	if (likely(freespace >= wqi_size)) {
> @@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
>  		client->no_wq_space++;
>  		ret = -EAGAIN;
>  	}
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>
>  	return ret;
>  }
> @@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
>
>  	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
>
> -	spin_lock(&client->wq_lock);
> +	spin_lock_irq(&client->wq_lock);
>  	client->wq_rsvd -= wqi_size;
> -	spin_unlock(&client->wq_lock);
> +	spin_unlock_irq(&client->wq_lock);
>  }
>
>  /* Construct a Work Item and append it to the GuC's Work Queue */
> @@ -532,10 +534,96 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
>
>  static void i915_guc_submit(struct drm_i915_gem_request *rq)
>  {
> -	i915_gem_request_submit(rq);
> +	__i915_gem_request_submit(rq);
>  	__i915_guc_submit(rq);
>  }
>
> +static void nested_enable_signaling(struct drm_i915_gem_request *rq)
> +{
> +	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
> +	 * detects an ordering issue between the fence lockclass and the
> +	 * global_timeline. This circular dependency can only occur via 2
> +	 * different fences (but same fence lockclass), so we use the nesting
> +	 * annotation here to prevent the warn, equivalent to the nesting
> +	 * inside i915_gem_request_submit() for when we also enable the
> +	 * signaler.
> +	 */
> +
> +	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
> +			     &rq->fence.flags))
> +		return;
> +
> +	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
> +	trace_dma_fence_enable_signal(&rq->fence);
> +
> +	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
> +	intel_engine_enable_signaling(rq);
> +	spin_unlock(&rq->lock);
> +}
> +
> +static bool i915_guc_dequeue(struct intel_engine_cs *engine)
> +{
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *last = port[0].request;
> +	unsigned long flags;
> +	struct rb_node *rb;
> +	bool submit = false;
> +
> +	spin_lock_irqsave(&engine->timeline->lock, flags);
> +	rb = engine->execlist_first;
> +	while (rb) {
> +		struct drm_i915_gem_request *cursor =
> +			rb_entry(rb, typeof(*cursor), priotree.node);
> +
> +		if (last && cursor->ctx != last->ctx) {
> +			if (port != engine->execlist_port)
> +				break;
> +
> +			i915_gem_request_assign(&port->request, last);
> +			nested_enable_signaling(last);
> +			port++;
> +		}
> +
> +		rb = rb_next(rb);
> +		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
> +		RB_CLEAR_NODE(&cursor->priotree.node);
> +		cursor->priotree.priority = INT_MAX;
> +
> +		i915_guc_submit(cursor);
> +		last = cursor;
> +		submit = true;
> +	}
> +	if (submit) {
> +		i915_gem_request_assign(&port->request, last);
> +		nested_enable_signaling(last);
> +		engine->execlist_first = rb;
> +	}
> +	spin_unlock_irqrestore(&engine->timeline->lock, flags);
> +
> +	return submit;
> +}
> +
> +static void i915_guc_irq_handler(unsigned long data)
> +{
> +	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
> +	struct execlist_port *port = engine->execlist_port;
> +	struct drm_i915_gem_request *rq;
> +	bool submit;
> +
> +	do {
> +		rq = port[0].request;
> +		while (rq && i915_gem_request_completed(rq)) {
> +			i915_gem_request_put(rq);
> +			port[0].request = port[1].request;
> +			port[1].request = NULL;
> +			rq = port[0].request; /* advance to the new head */
> +		}
> +
> +		submit = false;
> +		if (!port[1].request)
> +			submit = i915_guc_dequeue(engine);
> +	} while (submit);
> +}
> +
>  /*
>   * Everything below here is concerned with setup & teardown, and is
>   * therefore not part of the somewhat time-critical batch-submission
> @@ -944,15 +1032,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
>  	/* Take over from manual control of ELSP (execlists) */
>  	for_each_engine(engine, dev_priv, id) {
>  		struct drm_i915_gem_request *rq;
> +		unsigned long flags;
>
> -		engine->submit_request = i915_guc_submit;
> -		engine->schedule = NULL;
> +		tasklet_init(&engine->irq_tasklet,
> +			     i915_guc_irq_handler,
> +			     (unsigned long)engine);
>
>  		/* Replay the current set of previously submitted requests */
> +		spin_lock_irqsave(&engine->timeline->lock, flags);
>  		list_for_each_entry(rq, &engine->timeline->requests, link) {
> +			spin_lock(&client->wq_lock);
>  			client->wq_rsvd += sizeof(struct guc_wq_item);
> +			spin_unlock(&client->wq_lock);
> +
>  			__i915_guc_submit(rq);
>  		}
> +		spin_unlock_irqrestore(&engine->timeline->lock, flags);
>  	}
>
>  	return 0;
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index 0ff75f2282fa..e153344fcb1a 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -1350,13 +1350,20 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
>  static __always_inline void
>  gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
>  {
> -	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
> -		notify_ring(engine);
> +	bool tasklet = false;
>
>  	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
>  		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -		tasklet_hi_schedule(&engine->irq_tasklet);
> +		tasklet = true;
>  	}
> +
> +	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
> +		notify_ring(engine);
> +		tasklet |= i915.enable_guc_submission;
> +	}
> +
> +	if (tasklet)
> +		tasklet_hi_schedule(&engine->irq_tasklet);
>  }
>
>  static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 44a92ea464ba..c81d86afb52f 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -1278,7 +1278,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
>
>  	/* After a GPU reset, we may have requests to replay */
>  	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
> -	if (!execlists_elsp_idle(engine)) {
> +	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
>  		engine->execlist_port[0].count = 0;
>  		engine->execlist_port[1].count = 0;
>  		execlists_submit_ports(engine);
> @@ -1345,9 +1345,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
>  	request->ring->last_retired_head = -1;
>  	intel_ring_update_space(request->ring);
>
> -	if (i915.enable_guc_submission)
> -		return;
> -
>  	/* Catch up with any missed context-switch interrupts */
>  	if (request->ctx != port[0].request->ctx) {
>  		i915_gem_request_put(port[0].request);
>

Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

We should, though, hold off merging it until the performance data arrives.

Regards,

Tvrtko

* ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev3)
  2017-02-01 10:24 [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
                   ` (2 preceding siblings ...)
  2017-02-01 15:31 ` [PATCH v6] " Chris Wilson
@ 2017-02-01 16:54 ` Patchwork
  2017-02-02 13:54 ` ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev4) Patchwork
  4 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2017-02-01 16:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/scheduler: emulate a scheduler for guc (rev3)
URL   : https://patchwork.freedesktop.org/series/18909/
State : success

== Summary ==

Series 18909v3 drm/i915/scheduler: emulate a scheduler for guc
https://patchwork.freedesktop.org/api/1.0/series/18909/revisions/3/mbox/


fi-bdw-5557u     total:247  pass:233  dwarn:0   dfail:0   fail:0   skip:14 
fi-bsw-n3050     total:247  pass:208  dwarn:0   dfail:0   fail:0   skip:39 
fi-bxt-j4205     total:247  pass:225  dwarn:0   dfail:0   fail:0   skip:22 
fi-bxt-t5700     total:78   pass:65   dwarn:0   dfail:0   fail:0   skip:12 
fi-byt-j1900     total:247  pass:220  dwarn:0   dfail:0   fail:0   skip:27 
fi-byt-n2820     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-hsw-4770      total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-hsw-4770r     total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-ivb-3520m     total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-ivb-3770      total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-kbl-7500u     total:247  pass:224  dwarn:0   dfail:0   fail:2   skip:21 
fi-skl-6260u     total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-skl-6700hq    total:247  pass:227  dwarn:0   dfail:0   fail:0   skip:20 
fi-skl-6700k     total:247  pass:222  dwarn:4   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-snb-2520m     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-snb-2600      total:247  pass:215  dwarn:0   dfail:0   fail:0   skip:32 

8126864ab4eab95a09af5677535e8ccae3b6b73a drm-tip: 2017y-02m-01d-15h-25m-23s UTC integration manifest
b8f86a1 drm/i915/scheduler: emulate a scheduler for guc

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3667/

* [PATCH v8] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-01 16:15   ` [PATCH v7] " Chris Wilson
  2017-02-01 16:19     ` Tvrtko Ursulin
@ 2017-02-02 10:35     ` Chris Wilson
  2017-02-13  8:06       ` Zheng, Xiao
  1 sibling, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2017-02-02 10:35 UTC (permalink / raw)
  To: intel-gfx

This emulates execlists on top of the GuC in order to defer submission of
requests to the hardware. This deferral allows time for high priority
requests to gazump their way to the head of the queue, however it nerfs
the GuC by converting it back into a simple execlist (where the CPU has
to wake up after every request to feed new commands into the GuC).

v2: Drop hack status - though iirc there is still a lockdep inversion
between fence and engine->timeline->lock (which is impossible as the
nesting only occurs on different fences - hopefully just requires some
judicious lockdep annotation)
v3: Apply lockdep nesting to enabling signaling on the request, using
the pattern we already have in __i915_gem_request_submit();
v4: Replaying requests after a hang also now needs the timeline
spinlock, to disable the interrupts at least
v5: Hold wq lock for completeness, and emit a tracepoint for enabling signal
v6: Reorder interrupt checking for a happier gcc.
v7: Only signal the tasklet after a user-interrupt if using guc scheduling
v8: Restore lost update of rq through the i915_guc_irq_handler (Tvrtko)

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
---
 drivers/gpu/drm/i915/i915_guc_submission.c | 110 +++++++++++++++++++++++++++--
 drivers/gpu/drm/i915/i915_irq.c            |  13 +++-
 drivers/gpu/drm/i915/intel_lrc.c           |   5 +-
 3 files changed, 114 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 8ced9e26f075..d99185aafe58 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -25,6 +25,8 @@
 #include "i915_drv.h"
 #include "intel_uc.h"
 
+#include <trace/events/dma_fence.h>
+
 /**
  * DOC: GuC-based command submission
  *
@@ -348,7 +350,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 	u32 freespace;
 	int ret;
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	freespace = CIRC_SPACE(client->wq_tail, desc->head, client->wq_size);
 	freespace -= client->wq_rsvd;
 	if (likely(freespace >= wqi_size)) {
@@ -358,7 +360,7 @@ int i915_guc_wq_reserve(struct drm_i915_gem_request *request)
 		client->no_wq_space++;
 		ret = -EAGAIN;
 	}
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 
 	return ret;
 }
@@ -370,9 +372,9 @@ void i915_guc_wq_unreserve(struct drm_i915_gem_request *request)
 
 	GEM_BUG_ON(READ_ONCE(client->wq_rsvd) < wqi_size);
 
-	spin_lock(&client->wq_lock);
+	spin_lock_irq(&client->wq_lock);
 	client->wq_rsvd -= wqi_size;
-	spin_unlock(&client->wq_lock);
+	spin_unlock_irq(&client->wq_lock);
 }
 
 /* Construct a Work Item and append it to the GuC's Work Queue */
@@ -532,10 +534,97 @@ static void __i915_guc_submit(struct drm_i915_gem_request *rq)
 
 static void i915_guc_submit(struct drm_i915_gem_request *rq)
 {
-	i915_gem_request_submit(rq);
+	__i915_gem_request_submit(rq);
 	__i915_guc_submit(rq);
 }
 
+static void nested_enable_signaling(struct drm_i915_gem_request *rq)
+{
+	/* If we use dma_fence_enable_sw_signaling() directly, lockdep
+	 * detects an ordering issue between the fence lockclass and the
+	 * global_timeline. This circular dependency can only occur via 2
+	 * different fences (but same fence lockclass), so we use the nesting
+	 * annotation here to prevent the warn, equivalent to the nesting
+	 * inside i915_gem_request_submit() for when we also enable the
+	 * signaler.
+	 */
+
+	if (test_and_set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
+			     &rq->fence.flags))
+		return;
+
+	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &rq->fence.flags));
+	trace_dma_fence_enable_signal(&rq->fence);
+
+	spin_lock_nested(&rq->lock, SINGLE_DEPTH_NESTING);
+	intel_engine_enable_signaling(rq);
+	spin_unlock(&rq->lock);
+}
+
+static bool i915_guc_dequeue(struct intel_engine_cs *engine)
+{
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *last = port[0].request;
+	unsigned long flags;
+	struct rb_node *rb;
+	bool submit = false;
+
+	spin_lock_irqsave(&engine->timeline->lock, flags);
+	rb = engine->execlist_first;
+	while (rb) {
+		struct drm_i915_gem_request *cursor =
+			rb_entry(rb, typeof(*cursor), priotree.node);
+
+		if (last && cursor->ctx != last->ctx) {
+			if (port != engine->execlist_port)
+				break;
+
+			i915_gem_request_assign(&port->request, last);
+			nested_enable_signaling(last);
+			port++;
+		}
+
+		rb = rb_next(rb);
+		rb_erase(&cursor->priotree.node, &engine->execlist_queue);
+		RB_CLEAR_NODE(&cursor->priotree.node);
+		cursor->priotree.priority = INT_MAX;
+
+		i915_guc_submit(cursor);
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		i915_gem_request_assign(&port->request, last);
+		nested_enable_signaling(last);
+		engine->execlist_first = rb;
+	}
+	spin_unlock_irqrestore(&engine->timeline->lock, flags);
+
+	return submit;
+}
+
+static void i915_guc_irq_handler(unsigned long data)
+{
+	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
+	struct drm_i915_gem_request *rq;
+	bool submit;
+
+	do {
+		rq = port[0].request;
+		while (rq && i915_gem_request_completed(rq)) {
+			i915_gem_request_put(rq);
+			port[0].request = port[1].request;
+			port[1].request = NULL;
+			rq = port[0].request;
+		}
+
+		submit = false;
+		if (!port[1].request)
+			submit = i915_guc_dequeue(engine);
+	} while (submit);
+}
+
 /*
  * Everything below here is concerned with setup & teardown, and is
  * therefore not part of the somewhat time-critical batch-submission
@@ -944,15 +1033,22 @@ int i915_guc_submission_enable(struct drm_i915_private *dev_priv)
 	/* Take over from manual control of ELSP (execlists) */
 	for_each_engine(engine, dev_priv, id) {
 		struct drm_i915_gem_request *rq;
+		unsigned long flags;
 
-		engine->submit_request = i915_guc_submit;
-		engine->schedule = NULL;
+		tasklet_init(&engine->irq_tasklet,
+			     i915_guc_irq_handler,
+			     (unsigned long)engine);
 
 		/* Replay the current set of previously submitted requests */
+		spin_lock_irqsave(&engine->timeline->lock, flags);
 		list_for_each_entry(rq, &engine->timeline->requests, link) {
+			spin_lock(&client->wq_lock);
 			client->wq_rsvd += sizeof(struct guc_wq_item);
+			spin_unlock(&client->wq_lock);
+
 			__i915_guc_submit(rq);
 		}
+		spin_unlock_irqrestore(&engine->timeline->lock, flags);
 	}
 
 	return 0;
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 436b5baf38c2..72f9497518bc 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1350,13 +1350,20 @@ static void snb_gt_irq_handler(struct drm_i915_private *dev_priv,
 static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *engine, u32 iir, int test_shift)
 {
-	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
-		notify_ring(engine);
+	bool tasklet = false;
 
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
 		set_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-		tasklet_hi_schedule(&engine->irq_tasklet);
+		tasklet = true;
 	}
+
+	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift)) {
+		notify_ring(engine);
+		tasklet |= i915.enable_guc_submission;
+	}
+
+	if (tasklet)
+		tasklet_hi_schedule(&engine->irq_tasklet);
 }
 
 static irqreturn_t gen8_gt_irq_ack(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 44a92ea464ba..c81d86afb52f 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -1278,7 +1278,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 
 	/* After a GPU reset, we may have requests to replay */
 	clear_bit(ENGINE_IRQ_EXECLIST, &engine->irq_posted);
-	if (!execlists_elsp_idle(engine)) {
+	if (!i915.enable_guc_submission && !execlists_elsp_idle(engine)) {
 		engine->execlist_port[0].count = 0;
 		engine->execlist_port[1].count = 0;
 		execlists_submit_ports(engine);
@@ -1345,9 +1345,6 @@ static void reset_common_ring(struct intel_engine_cs *engine,
 	request->ring->last_retired_head = -1;
 	intel_ring_update_space(request->ring);
 
-	if (i915.enable_guc_submission)
-		return;
-
 	/* Catch up with any missed context-switch interrupts */
 	if (request->ctx != port[0].request->ctx) {
 		i915_gem_request_put(port[0].request);
-- 
2.11.0
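
For readers skimming the thread, the core of the patch is the two-port
bookkeeping in i915_guc_dequeue()/i915_guc_irq_handler(). Below is a toy,
userspace-only model of that loop; every name in it is invented for
illustration, and it deliberately omits the locking, signaling, and the
coalescing of consecutive same-context requests that the real driver does:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_request {
	int id;
	bool completed;
};

static struct toy_request *port[2];	/* port[0] is the oldest in flight */
static struct toy_request *pending[8];	/* stand-in for the priority rbtree */
static size_t qhead, qtail;

static bool toy_dequeue(void)
{
	bool submit = false;

	/* Refill the empty ports from the pending queue. The toy caps
	 * the number of requests in flight at two; the real dequeue
	 * coalesces runs of same-context requests into a single port.
	 */
	while (qhead != qtail && !port[1]) {
		struct toy_request *rq = pending[qhead++ % 8];

		if (!port[0])
			port[0] = rq;
		else
			port[1] = rq;
		printf("submit %d\n", rq->id);
		submit = true;
	}

	return submit;
}

static void toy_irq_handler(void)
{
	bool submit;

	do {
		/* Retire completed requests from port[0]; port[1] slides
		 * down, as in i915_guc_irq_handler().
		 */
		while (port[0] && port[0]->completed) {
			printf("retire %d\n", port[0]->id);
			port[0] = port[1];
			port[1] = NULL;
		}

		submit = false;
		if (!port[1])
			submit = toy_dequeue();
	} while (submit);
}

int main(void)
{
	struct toy_request rq[3] = { { 1, false }, { 2, false }, { 3, false } };
	size_t i;

	for (i = 0; i < 3; i++)
		pending[qtail++ % 8] = &rq[i];

	toy_irq_handler();	/* submits 1 and 2; 3 stays queued */
	rq[0].completed = true;
	toy_irq_handler();	/* retires 1, then submits 3 */

	return 0;
}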


* ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev4)
  2017-02-01 10:24 [PATCH v5] drm/i915/scheduler: emulate a scheduler for guc Chris Wilson
                   ` (3 preceding siblings ...)
  2017-02-01 16:54 ` ✓ Fi.CI.BAT: success for drm/i915/scheduler: emulate a scheduler for guc (rev3) Patchwork
@ 2017-02-02 13:54 ` Patchwork
  4 siblings, 0 replies; 14+ messages in thread
From: Patchwork @ 2017-02-02 13:54 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915/scheduler: emulate a scheduler for guc (rev4)
URL   : https://patchwork.freedesktop.org/series/18909/
State : success

== Summary ==

Series 18909v4 drm/i915/scheduler: emulate a scheduler for guc
https://patchwork.freedesktop.org/api/1.0/series/18909/revisions/4/mbox/


fi-bdw-5557u     total:247  pass:233  dwarn:0   dfail:0   fail:0   skip:14 
fi-bsw-n3050     total:247  pass:208  dwarn:0   dfail:0   fail:0   skip:39 
fi-bxt-j4205     total:247  pass:225  dwarn:0   dfail:0   fail:0   skip:22 
fi-bxt-t5700     total:78   pass:65   dwarn:0   dfail:0   fail:0   skip:12 
fi-byt-j1900     total:247  pass:220  dwarn:0   dfail:0   fail:0   skip:27 
fi-byt-n2820     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-hsw-4770      total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-hsw-4770r     total:247  pass:228  dwarn:0   dfail:0   fail:0   skip:19 
fi-ivb-3520m     total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-ivb-3770      total:247  pass:226  dwarn:0   dfail:0   fail:0   skip:21 
fi-skl-6260u     total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-skl-6700hq    total:247  pass:227  dwarn:0   dfail:0   fail:0   skip:20 
fi-skl-6700k     total:247  pass:222  dwarn:4   dfail:0   fail:0   skip:21 
fi-skl-6770hq    total:247  pass:234  dwarn:0   dfail:0   fail:0   skip:13 
fi-snb-2520m     total:247  pass:216  dwarn:0   dfail:0   fail:0   skip:31 
fi-snb-2600      total:247  pass:215  dwarn:0   dfail:0   fail:0   skip:32 

f9c08830c03caa5bc556eb7bb0e42dfd55d229cf drm-tip: 2017y-02m-02d-12h-26m-40s UTC integration manifest
1df3b1f9 drm/i915/scheduler: emulate a scheduler for guc

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_3675/

* Re: [PATCH v8] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-02 10:35     ` [PATCH v8] " Chris Wilson
@ 2017-02-13  8:06       ` Zheng, Xiao
  2017-02-14  1:08         ` Zheng, Xiao
  0 siblings, 1 reply; 14+ messages in thread
From: Zheng, Xiao @ 2017-02-13  8:06 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx; +Cc: Wang, Zhenyu Z

Thanks Chris.
This is Xiao from the GVT-g team. GVT-g has been looking for exactly this patch for when GuC submission is enabled on the host/native side. GVT-g requires single submission (only one request in the GuC FW at any time) because we need to do additional setup/cleanup around each VM switch.
One thing to mention: we need the 'execlists_context_status_change' notification before i915_guc_submit(), just as is done for the ELSP port write. We can submit another patch for that on top of this one.
The question is: when will this patch be merged upstream?
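
A rough sketch of the hook we have in mind, purely illustrative: it assumes
execlists_context_status_change() and INTEL_CONTEXT_SCHEDULE_IN from
intel_lrc.c (the helper is currently static there, so it would first need
to be exported), and the wrapper name and placement are ours, not part of
Chris's patch:

static void i915_guc_submit_with_notify(struct drm_i915_gem_request *rq)
{
	/* Tell listeners (e.g. GVT-g) that this context is about to be
	 * scheduled in, mirroring what execlists does before writing
	 * the ELSP ports. Assumes execlists_context_status_change() is
	 * made visible outside intel_lrc.c.
	 */
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);

	__i915_gem_request_submit(rq);
	__i915_guc_submit(rq);
}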

> [Chris's v8 patch quoted in full; snipped, as it is identical to the
> message above]

* Re: [PATCH v8] drm/i915/scheduler: emulate a scheduler for guc
  2017-02-13  8:06       ` Zheng, Xiao
@ 2017-02-14  1:08         ` Zheng, Xiao
  0 siblings, 0 replies; 14+ messages in thread
From: Zheng, Xiao @ 2017-02-14  1:08 UTC (permalink / raw)
  To: Zheng, Xiao, Chris Wilson, intel-gfx; +Cc: Wang, Zhenyu Z

Hi Chris,
Just checking with you on whether this patch will land, since it is essential for GVT-g once GuC submission is enabled.

> [previous message and the quoted v8 patch snipped; see above]
