* [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock
@ 2016-03-17 12:59 Tvrtko Ursulin
  2016-03-17 13:14 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2016-03-17 12:59 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

By reading the CSB (slow MMIO accesses) into a temporary local
buffer we can decrease the duration for which the execlist
lock is held.

The main advantage is that during heavy batch buffer submission we
reduce execlist lock contention between the submitting userspace
process and the interrupt handler, which should decrease both
latency and CPU usage.

The downside is that we need to grab and release the forcewake twice,
but as the numbers below show, this cost is completely hidden
by the primary gains.

Testing with "gem_latency -n 100" (submitting batch buffers with a
hundred nops each) shows more than double the throughput and more
than half the dispatch latency, overall latency and CPU time spent
in the submitting process.

Submitting empty batches ("gem_latency -n 0") does not seem to be
significantly affected by this change, with throughput and CPU
time improving by half a percent and overall latency worsening
by the same amount.

The above tests were done over a hundred runs on a big-core Broadwell.

v2:
  * Overflow protection to local CSB buffer.
  * Use closer dev_priv in execlists_submit_requests. (Chris Wilson)

v3: Rebase.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/intel_lrc.c | 77 ++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index f72782200226..c6b3a9d19af2 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -416,15 +416,23 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
 static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
 				      struct drm_i915_gem_request *rq1)
 {
+	struct drm_i915_private *dev_priv = rq0->i915;
+
 	execlists_update_context(rq0);
 
 	if (rq1)
 		execlists_update_context(rq1);
 
+	spin_lock(&dev_priv->uncore.lock);
+	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
+
 	execlists_elsp_write(rq0, rq1);
+
+	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+	spin_unlock(&dev_priv->uncore.lock);
 }
 
-static void execlists_context_unqueue__locked(struct intel_engine_cs *engine)
+static void execlists_context_unqueue(struct intel_engine_cs *engine)
 {
 	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
 	struct drm_i915_gem_request *cursor, *tmp;
@@ -478,19 +486,6 @@ static void execlists_context_unqueue__locked(struct intel_engine_cs *engine)
 	execlists_submit_requests(req0, req1);
 }
 
-static void execlists_context_unqueue(struct intel_engine_cs *engine)
-{
-	struct drm_i915_private *dev_priv = engine->dev->dev_private;
-
-	spin_lock(&dev_priv->uncore.lock);
-	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
-
-	execlists_context_unqueue__locked(engine);
-
-	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-	spin_unlock(&dev_priv->uncore.lock);
-}
-
 static unsigned int
 execlists_check_remove_request(struct intel_engine_cs *engine, u32 request_id)
 {
@@ -551,12 +546,10 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
 	struct drm_i915_private *dev_priv = engine->dev->dev_private;
 	u32 status_pointer;
 	unsigned int read_pointer, write_pointer;
-	u32 status = 0;
-	u32 status_id;
+	u32 csb[GEN8_CSB_ENTRIES][2];
+	unsigned int csb_read = 0, i;
 	unsigned int submit_contexts = 0;
 
-	spin_lock(&engine->execlist_lock);
-
 	spin_lock(&dev_priv->uncore.lock);
 	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
 
@@ -568,41 +561,47 @@ void intel_lrc_irq_handler(struct intel_engine_cs *engine)
 		write_pointer += GEN8_CSB_ENTRIES;
 
 	while (read_pointer < write_pointer) {
-		status = get_context_status(engine, ++read_pointer,
-				            &status_id);
+		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
+			break;
+		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
+						      &csb[csb_read][1]);
+		csb_read++;
+	}
 
-		if (unlikely(status & GEN8_CTX_STATUS_PREEMPTED)) {
-			if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
-				if (execlists_check_remove_request(engine, status_id))
+	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
+
+	/* Update the read pointer to the old write pointer. Manual ringbuffer
+	 * management ftw </sarcasm> */
+	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
+		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+				    engine->next_context_status_buffer << 8));
+
+	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
+	spin_unlock(&dev_priv->uncore.lock);
+
+	spin_lock(&engine->execlist_lock);
+
+	for (i = 0; i < csb_read; i++) {
+		if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
+			if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
+				if (execlists_check_remove_request(engine, csb[i][1]))
 					WARN(1, "Lite Restored request removed from queue\n");
 			} else
 				WARN(1, "Preemption without Lite Restore\n");
 		}
 
-		if (status & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+		if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
 		    GEN8_CTX_STATUS_ELEMENT_SWITCH))
 			submit_contexts +=
-				execlists_check_remove_request(engine,
-						               status_id);
+				execlists_check_remove_request(engine, csb[i][1]);
 	}
 
 	if (submit_contexts) {
 		if (!engine->disable_lite_restore_wa ||
-		    (status & GEN8_CTX_STATUS_ACTIVE_IDLE))
-			execlists_context_unqueue__locked(engine);
+		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
+			execlists_context_unqueue(engine);
 	}
 
-	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
-
-	/* Update the read pointer to the old write pointer. Manual ringbuffer
-	 * management ftw </sarcasm> */
-	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
-		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-				    engine->next_context_status_buffer << 8));
-
-	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
-	spin_unlock(&dev_priv->uncore.lock);
-
 	spin_unlock(&engine->execlist_lock);
 
 	if (unlikely(submit_contexts > 2))
-- 
1.9.1
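
The shape of the change is easier to see outside the unified diff: the
interrupt handler now snapshots the CSB into an on-stack array before
taking the execlist lock, and only walks that local copy under the lock.
A minimal plain-C sketch of the same pattern (a pthread mutex stands in
for the spinlock, stub functions stand in for the MMIO reads and request
bookkeeping; all names here are hypothetical, this is not the driver
code):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define CSB_ENTRIES 6

static pthread_mutex_t execlist_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for one slow MMIO read of a context status entry. */
static uint32_t read_csb_entry(unsigned int index, uint32_t *status_id)
{
	*status_id = index;		/* placeholder value */
	return 0x8000 | index;		/* placeholder status bits */
}

/* Stand-in for the per-entry bookkeeping done under the lock. */
static void process_entry(uint32_t status, uint32_t status_id)
{
	printf("status %#x id %u\n",
	       (unsigned int)status, (unsigned int)status_id);
}

static void irq_handler(unsigned int read_ptr, unsigned int write_ptr)
{
	uint32_t csb[CSB_ENTRIES][2];
	unsigned int count = 0, i;

	/* Slow part first, without holding the contended lock: snapshot
	 * every pending entry into an on-stack buffer (bounded, like the
	 * v2 overflow protection with WARN_ON_ONCE). */
	while (read_ptr < write_ptr && count < CSB_ENTRIES) {
		csb[count][0] = read_csb_entry(++read_ptr, &csb[count][1]);
		count++;
	}

	/* Fast part: take the lock only to walk the local snapshot. */
	pthread_mutex_lock(&execlist_lock);
	for (i = 0; i < count; i++)
		process_entry(csb[i][0], csb[i][1]);
	pthread_mutex_unlock(&execlist_lock);
}

int main(void)
{
	irq_handler(0, 3);	/* pretend three entries arrived */
	return 0;
}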


* Re: [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock
  2016-03-17 12:59 [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock Tvrtko Ursulin
@ 2016-03-17 13:14 ` Chris Wilson
  2016-03-17 16:30   ` Tvrtko Ursulin
  2016-03-18  7:02 ` ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4) Patchwork
  2016-03-18  9:46 ` Patchwork
  2 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2016-03-17 13:14 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Thu, Mar 17, 2016 at 12:59:46PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> By reading the CSB (slow MMIO accesses) into a temporary local
> buffer we can decrease the duration of holding the execlist
> lock.
> 
> Main advantage is that during heavy batch buffer submission we
> reduce the execlist lock contention, which should decrease the
> latency and CPU usage between the submitting userspace process
> and interrupt handling.
> 
> Downside is that we need to grab and release the forcewake twice,
> but as the below numbers will show this is completely hidden
> by the primary gains.
> 
> Testing with "gem_latency -n 100" (submit batch buffers with a
> hundred nops each) shows more than doubling of the throughput
> and more than halving of the dispatch latency, overall latency
> and CPU time spent in the submitting process.
> 
> Submitting empty batches ("gem_latency -n 0") does not seem
> significantly affected by this change with throughput and CPU
> time improving by half a percent, and overall latency worsening
> by the same amount.
> 
> Above tests were done in a hundred runs on a big core Broadwell.
> 
> v2:
>   * Overflow protection to local CSB buffer.
>   * Use closer dev_priv in execlists_submit_requests. (Chris Wilson)
> 
> v3: Rebase.
> 
> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> Cc: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/intel_lrc.c | 77 ++++++++++++++++++++--------------------
>  1 file changed, 38 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index f72782200226..c6b3a9d19af2 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -416,15 +416,23 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
>  static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
>  				      struct drm_i915_gem_request *rq1)
>  {
> +	struct drm_i915_private *dev_priv = rq0->i915;
> +
>  	execlists_update_context(rq0);
>  
>  	if (rq1)
>  		execlists_update_context(rq1);
>  
> +	spin_lock(&dev_priv->uncore.lock);
> +	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);

My only problem with this is that it puts the onus on the caller to
remember to disable irqs first. (That would be a silent conflict with
the patch to move the submission to a kthread for example.)
What's the cost of being upfront with spin_lock_irqsave() here?
/* BUG_ON(!irqs_disabled());  */ ?
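
Concretely, the two options would look roughly like this (only a sketch
with hypothetical function names, mirroring the submit path from the
patch; kernel-style fragment, not a concrete proposal):

static void execlists_submit__assert(struct drm_i915_gem_request *rq0,
				     struct drm_i915_gem_request *rq1)
{
	struct drm_i915_private *dev_priv = rq0->i915;

	/* Option A: keep the cheap spin_lock(), but turn the implicit
	 * "caller runs with interrupts off" requirement into a runtime
	 * check so a future caller (e.g. a kthread) trips it loudly. */
	WARN_ON_ONCE(!irqs_disabled());

	spin_lock(&dev_priv->uncore.lock);
	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
	/* ... execlists_elsp_write(rq0, rq1) ... */
	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
	spin_unlock(&dev_priv->uncore.lock);
}

static void execlists_submit__irqsave(struct drm_i915_gem_request *rq0,
				      struct drm_i915_gem_request *rq1)
{
	struct drm_i915_private *dev_priv = rq0->i915;
	unsigned long flags;

	/* Option B: be upfront and pay for the irqsave variant, so the
	 * function is correct regardless of the caller's context. */
	spin_lock_irqsave(&dev_priv->uncore.lock, flags);
	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
	/* ... execlists_elsp_write(rq0, rq1) ... */
	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
	spin_unlock_irqrestore(&dev_priv->uncore.lock, flags);
}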

I'm quite happy to go with the comment here and since it doesn't make
the lockups any worse,
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock
  2016-03-17 13:14 ` Chris Wilson
@ 2016-03-17 16:30   ` Tvrtko Ursulin
  2016-03-21  9:22     ` Daniel Vetter
  0 siblings, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2016-03-17 16:30 UTC (permalink / raw)
  To: Chris Wilson, Intel-gfx, Tvrtko Ursulin


On 17/03/16 13:14, Chris Wilson wrote:
> On Thu, Mar 17, 2016 at 12:59:46PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> By reading the CSB (slow MMIO accesses) into a temporary local
>> buffer we can decrease the duration of holding the execlist
>> lock.
>>
>> Main advantage is that during heavy batch buffer submission we
>> reduce the execlist lock contention, which should decrease the
>> latency and CPU usage between the submitting userspace process
>> and interrupt handling.
>>
>> Downside is that we need to grab and release the forcewake twice,
>> but as the below numbers will show this is completely hidden
>> by the primary gains.
>>
>> Testing with "gem_latency -n 100" (submit batch buffers with a
>> hundred nops each) shows more than doubling of the throughput
>> and more than halving of the dispatch latency, overall latency
>> and CPU time spent in the submitting process.
>>
>> Submitting empty batches ("gem_latency -n 0") does not seem
>> significantly affected by this change with throughput and CPU
>> time improving by half a percent, and overall latency worsening
>> by the same amount.
>>
>> Above tests were done in a hundred runs on a big core Broadwell.
>>
>> v2:
>>    * Overflow protection to local CSB buffer.
>>    * Use closer dev_priv in execlists_submit_requests. (Chris Wilson)
>>
>> v3: Rebase.
>>
>> Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>> Cc: Chris Wilson <chris@chris-wilson.co.uk>
>> ---
>>   drivers/gpu/drm/i915/intel_lrc.c | 77 ++++++++++++++++++++--------------------
>>   1 file changed, 38 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index f72782200226..c6b3a9d19af2 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -416,15 +416,23 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
>>   static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
>>   				      struct drm_i915_gem_request *rq1)
>>   {
>> +	struct drm_i915_private *dev_priv = rq0->i915;
>> +
>>   	execlists_update_context(rq0);
>>
>>   	if (rq1)
>>   		execlists_update_context(rq1);
>>
>> +	spin_lock(&dev_priv->uncore.lock);
>> +	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
>
> My only problem with this is that it puts the onus on the caller to
> remember to disable irqs first. (That would be a silent conflict with
> the patch to move the submission to a kthread for example.)

Oh well, in this code one has to know what they are doing anyway, so I
consider this very minor.

> What's the cost of being upfront with spin_lock_irqsave() here?

No idea, but probably not much, if detectable at all. The issue I have
with that is that elsewhere we have the approach of using the exact
right locking primitive for each use case.

> /* BUG_ON(!irqs_disabled());  */ ?

As a comment?

> I'm quite happy to go with the comment here and since it doesn't make
> the lockups any worse,
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

Thanks, will add that comment.

Regards,

Tvrtko

* ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
  2016-03-17 12:59 [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock Tvrtko Ursulin
  2016-03-17 13:14 ` Chris Wilson
@ 2016-03-18  7:02 ` Patchwork
  2016-03-18 10:31   ` Tvrtko Ursulin
  2016-03-18  9:46 ` Patchwork
  2 siblings, 1 reply; 7+ messages in thread
From: Patchwork @ 2016-03-18  7:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
URL   : https://patchwork.freedesktop.org/series/3973/
State : failure

== Summary ==

Series 3973v4 drm/i915: Move CSB MMIO reads out of the execlists lock
http://patchwork.freedesktop.org/api/1.0/series/3973/revisions/4/mbox/

Test gem_ringfill:
        Subgroup basic-default-s3:
                dmesg-warn -> PASS       (skl-nuci5)
Test gem_storedw_loop:
        Subgroup basic-default:
                dmesg-warn -> PASS       (skl-nuci5)
Test gem_sync:
        Subgroup basic-vebox:
                dmesg-warn -> PASS       (skl-nuci5)
Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                pass       -> DMESG-WARN (bdw-ultra)
Test kms_pipe_crc_basic:
        Subgroup nonblocking-crc-pipe-c-frame-sequence:
                pass       -> DMESG-WARN (hsw-brixbox)
        Subgroup read-crc-pipe-a-frame-sequence:
                dmesg-warn -> PASS       (hsw-gt2)
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (hsw-gt2)
        Subgroup suspend-read-crc-pipe-c:
                dmesg-warn -> PASS       (bsw-nuc-2)
Test pm_rpm:
        Subgroup basic-pci-d3-state:
                pass       -> DMESG-WARN (hsw-brixbox)

bdw-ultra        total:194  pass:171  dwarn:2   dfail:0   fail:0   skip:21 
bsw-nuc-2        total:194  pass:156  dwarn:1   dfail:0   fail:0   skip:37 
hsw-brixbox      total:194  pass:169  dwarn:3   dfail:0   fail:0   skip:22 
hsw-gt2          total:115  pass:104  dwarn:1   dfail:0   fail:0   skip:9  
ivb-t430s        total:194  pass:168  dwarn:1   dfail:0   fail:0   skip:25 
skl-i5k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23 
skl-i7k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23 
skl-nuci5        total:194  pass:182  dwarn:1   dfail:0   fail:0   skip:11 

Results at /archive/results/CI_IGT_test/Patchwork_1632/

10e913a48ca36790da9b58bed8729598ea79ebdb drm-intel-nightly: 2016y-03m-17d-13h-22m-41s UTC integration manifest
f6a228bfa750f3f23c1a95ca76f08b148a02f2a5 drm/i915: Move CSB MMIO reads out of the execlists lock


* ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
  2016-03-17 12:59 [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock Tvrtko Ursulin
  2016-03-17 13:14 ` Chris Wilson
  2016-03-18  7:02 ` ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4) Patchwork
@ 2016-03-18  9:46 ` Patchwork
  2 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2016-03-18  9:46 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
URL   : https://patchwork.freedesktop.org/series/3973/
State : failure

== Summary ==

Series 3973v4 drm/i915: Move CSB MMIO reads out of the execlists lock
http://patchwork.freedesktop.org/api/1.0/series/3973/revisions/4/mbox/

Test gem_ringfill:
        Subgroup basic-default-s3:
                dmesg-warn -> PASS       (skl-nuci5)
Test gem_storedw_loop:
        Subgroup basic-default:
                dmesg-warn -> PASS       (skl-nuci5)
Test gem_sync:
        Subgroup basic-vebox:
                dmesg-warn -> PASS       (skl-nuci5)
Test kms_flip:
        Subgroup basic-flip-vs-wf_vblank:
                pass       -> DMESG-WARN (bdw-ultra)
Test kms_pipe_crc_basic:
        Subgroup nonblocking-crc-pipe-c-frame-sequence:
                pass       -> DMESG-WARN (hsw-brixbox)
        Subgroup read-crc-pipe-a-frame-sequence:
                dmesg-warn -> PASS       (hsw-gt2)
        Subgroup suspend-read-crc-pipe-b:
                pass       -> INCOMPLETE (hsw-gt2)
        Subgroup suspend-read-crc-pipe-c:
                dmesg-warn -> PASS       (bsw-nuc-2)
Test pm_rpm:
        Subgroup basic-pci-d3-state:
                pass       -> DMESG-WARN (hsw-brixbox)

bdw-ultra        total:194  pass:171  dwarn:2   dfail:0   fail:0   skip:21 
bsw-nuc-2        total:194  pass:156  dwarn:1   dfail:0   fail:0   skip:37 
hsw-brixbox      total:194  pass:169  dwarn:3   dfail:0   fail:0   skip:22 
hsw-gt2          total:115  pass:104  dwarn:1   dfail:0   fail:0   skip:9  
ivb-t430s        total:194  pass:168  dwarn:1   dfail:0   fail:0   skip:25 
skl-i5k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23 
skl-i7k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23 
skl-nuci5        total:194  pass:182  dwarn:1   dfail:0   fail:0   skip:11 

Results at /archive/results/CI_IGT_test/Patchwork_1632/

10e913a48ca36790da9b58bed8729598ea79ebdb drm-intel-nightly: 2016y-03m-17d-13h-22m-41s UTC integration manifest
f6a228bfa750f3f23c1a95ca76f08b148a02f2a5 drm/i915: Move CSB MMIO reads out of the execlists lock


* Re: ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
  2016-03-18  7:02 ` ✗ Fi.CI.BAT: failure for drm/i915: Move CSB MMIO reads out of the execlists lock (rev4) Patchwork
@ 2016-03-18 10:31   ` Tvrtko Ursulin
  0 siblings, 0 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2016-03-18 10:31 UTC (permalink / raw)
  To: intel-gfx, Chris Wilson


On 18/03/16 07:02, Patchwork wrote:
> == Series Details ==
>
> Series: drm/i915: Move CSB MMIO reads out of the execlists lock (rev4)
> URL   : https://patchwork.freedesktop.org/series/3973/
> State : failure
>
> == Summary ==
>
> Series 3973v4 drm/i915: Move CSB MMIO reads out of the execlists lock
> http://patchwork.freedesktop.org/api/1.0/series/3973/revisions/4/mbox/
>
> Test gem_ringfill:
>          Subgroup basic-default-s3:
>                  dmesg-warn -> PASS       (skl-nuci5)
> Test gem_storedw_loop:
>          Subgroup basic-default:
>                  dmesg-warn -> PASS       (skl-nuci5)
> Test gem_sync:
>          Subgroup basic-vebox:
>                  dmesg-warn -> PASS       (skl-nuci5)
> Test kms_flip:
>          Subgroup basic-flip-vs-wf_vblank:
>                  pass       -> DMESG-WARN (bdw-ultra)

Device suspended during HW access: 
https://bugs.freedesktop.org/show_bug.cgi?id=94349

> Test kms_pipe_crc_basic:
>          Subgroup nonblocking-crc-pipe-c-frame-sequence:
>                  pass       -> DMESG-WARN (hsw-brixbox)

Unclaimed register: https://bugs.freedesktop.org/show_bug.cgi?id=94384

>          Subgroup read-crc-pipe-a-frame-sequence:
>                  dmesg-warn -> PASS       (hsw-gt2)
>          Subgroup suspend-read-crc-pipe-b:
>                  pass       -> INCOMPLETE (hsw-gt2)

Haswell does not use the code touched here, so I am discounting this as
unrelated.

>          Subgroup suspend-read-crc-pipe-c:
>                  dmesg-warn -> PASS       (bsw-nuc-2)
> Test pm_rpm:
>          Subgroup basic-pci-d3-state:
>                  pass       -> DMESG-WARN (hsw-brixbox)

Device suspended during HW access: 
https://bugs.freedesktop.org/show_bug.cgi?id=94349

>
> bdw-ultra        total:194  pass:171  dwarn:2   dfail:0   fail:0   skip:21
> bsw-nuc-2        total:194  pass:156  dwarn:1   dfail:0   fail:0   skip:37
> hsw-brixbox      total:194  pass:169  dwarn:3   dfail:0   fail:0   skip:22
> hsw-gt2          total:115  pass:104  dwarn:1   dfail:0   fail:0   skip:9
> ivb-t430s        total:194  pass:168  dwarn:1   dfail:0   fail:0   skip:25
> skl-i5k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23
> skl-i7k-2        total:194  pass:170  dwarn:1   dfail:0   fail:0   skip:23
> skl-nuci5        total:194  pass:182  dwarn:1   dfail:0   fail:0   skip:11
>
> Results at /archive/results/CI_IGT_test/Patchwork_1632/
>
> 10e913a48ca36790da9b58bed8729598ea79ebdb drm-intel-nightly: 2016y-03m-17d-13h-22m-41s UTC integration manifest
> f6a228bfa750f3f23c1a95ca76f08b148a02f2a5 drm/i915: Move CSB MMIO reads out of the execlists lock

Merged, thanks for the review.

Regards,

Tvrtko

* Re: [PATCH v3] drm/i915: Move CSB MMIO reads out of the execlists lock
  2016-03-17 16:30   ` Tvrtko Ursulin
@ 2016-03-21  9:22     ` Daniel Vetter
  0 siblings, 0 replies; 7+ messages in thread
From: Daniel Vetter @ 2016-03-21  9:22 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Thu, Mar 17, 2016 at 04:30:47PM +0000, Tvrtko Ursulin wrote:
> 
> On 17/03/16 13:14, Chris Wilson wrote:
> >On Thu, Mar 17, 2016 at 12:59:46PM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>By reading the CSB (slow MMIO accesses) into a temporary local
> >>buffer we can decrease the duration of holding the execlist
> >>lock.
> >>
> >>Main advantage is that during heavy batch buffer submission we
> >>reduce the execlist lock contention, which should decrease the
> >>latency and CPU usage between the submitting userspace process
> >>and interrupt handling.
> >>
> >>Downside is that we need to grab and release the forcewake twice,
> >>but as the below numbers will show this is completely hidden
> >>by the primary gains.
> >>
> >>Testing with "gem_latency -n 100" (submit batch buffers with a
> >>hundred nops each) shows more than doubling of the throughput
> >>and more than halving of the dispatch latency, overall latency
> >>and CPU time spent in the submitting process.
> >>
> >>Submitting empty batches ("gem_latency -n 0") does not seem
> >>significantly affected by this change with throughput and CPU
> >>time improving by half a percent, and overall latency worsening
> >>by the same amount.
> >>
> >>Above tests were done in a hundred runs on a big core Broadwell.
> >>
> >>v2:
> >>   * Overflow protection to local CSB buffer.
> >>   * Use closer dev_priv in execlists_submit_requests. (Chris Wilson)
> >>
> >>v3: Rebase.
> >>
> >>Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>Cc: Chris Wilson <chris@chris-wilson.co.uk>
> >>---
> >>  drivers/gpu/drm/i915/intel_lrc.c | 77 ++++++++++++++++++++--------------------
> >>  1 file changed, 38 insertions(+), 39 deletions(-)
> >>
> >>diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> >>index f72782200226..c6b3a9d19af2 100644
> >>--- a/drivers/gpu/drm/i915/intel_lrc.c
> >>+++ b/drivers/gpu/drm/i915/intel_lrc.c
> >>@@ -416,15 +416,23 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
> >>  static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
> >>  				      struct drm_i915_gem_request *rq1)
> >>  {
> >>+	struct drm_i915_private *dev_priv = rq0->i915;
> >>+
> >>  	execlists_update_context(rq0);
> >>
> >>  	if (rq1)
> >>  		execlists_update_context(rq1);
> >>
> >>+	spin_lock(&dev_priv->uncore.lock);
> >>+	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
> >
> >My only problem with this is that it puts the onus on the caller to
> >remember to disable irqs first. (That would be a silent conflict with
> >the patch to move the submission to a kthread for example.)
> 
> Oh well, in this code one has to know what they are doing anyway, so I
> consider this very minor.
> 
> >What's the cost of being upfront with spin_lock_irqsave() here?
> 
> No idea, but probably not much, if detectable at all. The issue I have
> with that is that elsewhere we have the approach of using the exact
> right locking primitive for each use case.
> 
> >/* BUG_ON(!irqs_disabled());  */ ?
> 
> As a comment?

As a line of code. I agree with Chris that we should add this; making it
even harder for newbies to dig into gem/execlist code doesn't help anyone.
With this line we get up to

"Get it right or it breaks at runtime."

Before that we are at

"Read the right mailing list thread and you get it right"

Per http://sweng.the-davies.net/Home/rustys-api-design-manifesto

Of course, if it has significant overhead then we need to reconsider.
-Daniel

> 
> >I'm quite happy to go with the comment here and since it doesn't make
> >the lockups any worse,
> >Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> Thanks, will add that comment.
> 
> Regards,
> 
> Tvrtko

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
