* [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
@ 2015-12-18 11:59 Tvrtko Ursulin
  2015-12-18 12:28 ` Chris Wilson
                   ` (2 more replies)
  0 siblings, 3 replies; 7+ messages in thread
From: Tvrtko Ursulin @ 2015-12-18 11:59 UTC (permalink / raw)
  To: Intel-gfx

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

We can rely on the context complete interrupt to wake up the waiters,
except in the case where requests are merged into a single ELSP
submission. In that case we inject MI_USER_INTERRUPT instructions into
the ring buffer to ensure prompt wake-ups.

On the GLBenchmark Egypt off-screen test, for example, this
optimization decreases the number of generated interrupts per second
by a factor of two and the number of context switches by a factor of
five to six.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_drv.h  |  1 +
 drivers/gpu/drm/i915/i915_irq.c  | 24 ++++++++++++------------
 drivers/gpu/drm/i915/intel_lrc.c | 26 +++++++++++++++++++++++++-
 3 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 1d28d90ed901..f6fd2023aaf0 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -2811,6 +2811,7 @@ ibx_disable_display_interrupt(struct drm_i915_private *dev_priv, uint32_t bits)
 	ibx_display_interrupt_update(dev_priv, bits, 0);
 }
 
+void intel_notify_ring(struct intel_engine_cs *ring);
 
 /* i915_gem.c */
 int i915_gem_create_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 3f8c753997ba..a2087bad0d64 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -994,7 +994,7 @@ static void ironlake_rps_change_irq_handler(struct drm_device *dev)
 	return;
 }
 
-static void notify_ring(struct intel_engine_cs *ring)
+void intel_notify_ring(struct intel_engine_cs *ring)
 {
 	if (!intel_ring_initialized(ring))
 		return;
@@ -1291,9 +1291,9 @@ static void ilk_gt_irq_handler(struct drm_device *dev,
 {
 	if (gt_iir &
 	    (GT_RENDER_USER_INTERRUPT | GT_RENDER_PIPECTL_NOTIFY_INTERRUPT))
-		notify_ring(&dev_priv->ring[RCS]);
+		intel_notify_ring(&dev_priv->ring[RCS]);
 	if (gt_iir & ILK_BSD_USER_INTERRUPT)
-		notify_ring(&dev_priv->ring[VCS]);
+		intel_notify_ring(&dev_priv->ring[VCS]);
 }
 
 static void snb_gt_irq_handler(struct drm_device *dev,
@@ -1303,11 +1303,11 @@ static void snb_gt_irq_handler(struct drm_device *dev,
 
 	if (gt_iir &
 	    (GT_RENDER_USER_INTERRUPT | GT_RENDER_PIPECTL_NOTIFY_INTERRUPT))
-		notify_ring(&dev_priv->ring[RCS]);
+		intel_notify_ring(&dev_priv->ring[RCS]);
 	if (gt_iir & GT_BSD_USER_INTERRUPT)
-		notify_ring(&dev_priv->ring[VCS]);
+		intel_notify_ring(&dev_priv->ring[VCS]);
 	if (gt_iir & GT_BLT_USER_INTERRUPT)
-		notify_ring(&dev_priv->ring[BCS]);
+		intel_notify_ring(&dev_priv->ring[BCS]);
 
 	if (gt_iir & (GT_BLT_CS_ERROR_INTERRUPT |
 		      GT_BSD_CS_ERROR_INTERRUPT |
@@ -1322,7 +1322,7 @@ static __always_inline void
 gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
 {
 	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
-		notify_ring(ring);
+		intel_notify_ring(ring);
 	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift))
 		intel_lrc_irq_handler(ring);
 }
@@ -1629,7 +1629,7 @@ static void gen6_rps_irq_handler(struct drm_i915_private *dev_priv, u32 pm_iir)
 
 	if (HAS_VEBOX(dev_priv->dev)) {
 		if (pm_iir & PM_VEBOX_USER_INTERRUPT)
-			notify_ring(&dev_priv->ring[VECS]);
+			intel_notify_ring(&dev_priv->ring[VECS]);
 
 		if (pm_iir & PM_VEBOX_CS_ERROR_INTERRUPT)
 			DRM_DEBUG("Command parser error, pm_iir 0x%08x\n", pm_iir);
@@ -3961,7 +3961,7 @@ static irqreturn_t i8xx_irq_handler(int irq, void *arg)
 		new_iir = I915_READ16(IIR); /* Flush posted writes */
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(&dev_priv->ring[RCS]);
+			intel_notify_ring(&dev_priv->ring[RCS]);
 
 		for_each_pipe(dev_priv, pipe) {
 			int plane = pipe;
@@ -4157,7 +4157,7 @@ static irqreturn_t i915_irq_handler(int irq, void *arg)
 		new_iir = I915_READ(IIR); /* Flush posted writes */
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(&dev_priv->ring[RCS]);
+			intel_notify_ring(&dev_priv->ring[RCS]);
 
 		for_each_pipe(dev_priv, pipe) {
 			int plane = pipe;
@@ -4387,9 +4387,9 @@ static irqreturn_t i965_irq_handler(int irq, void *arg)
 		new_iir = I915_READ(IIR); /* Flush posted writes */
 
 		if (iir & I915_USER_INTERRUPT)
-			notify_ring(&dev_priv->ring[RCS]);
+			intel_notify_ring(&dev_priv->ring[RCS]);
 		if (iir & I915_BSD_USER_INTERRUPT)
-			notify_ring(&dev_priv->ring[VCS]);
+			intel_notify_ring(&dev_priv->ring[VCS]);
 
 		for_each_pipe(dev_priv, pipe) {
 			if (pipe_stats[pipe] & PIPE_START_VBLANK_INTERRUPT_STATUS &&
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 27f06198a51e..d9be878dbde7 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -359,6 +359,13 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	spin_unlock(&dev_priv->uncore.lock);
 }
 
+static void execlists_emit_user_interrupt(struct drm_i915_gem_request *req)
+{
+	struct intel_ringbuffer *ringbuf = req->ringbuf;
+
+	iowrite32(MI_USER_INTERRUPT, ringbuf->virtual_start + req->tail - 8);
+}
+
 static int execlists_update_context(struct drm_i915_gem_request *rq)
 {
 	struct intel_engine_cs *ring = rq->ring;
@@ -433,6 +440,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
 			cursor->elsp_submitted = req0->elsp_submitted;
 			list_move_tail(&req0->execlist_link,
 				       &ring->execlist_retired_req_list);
+			/*
+			 * When merging requests make sure there is still
+			 * something after each batch buffer to wake up waiters.
+			 */
+			if (cursor != req0)
+				execlists_emit_user_interrupt(req0);
 			req0 = cursor;
 		} else {
 			req1 = cursor;
@@ -472,6 +485,12 @@ static bool execlists_check_remove_request(struct intel_engine_cs *ring,
 
 	assert_spin_locked(&ring->execlist_lock);
 
+	/*
+	 * This is effectively a context complete interrupt so wake
+	 * up potential waiters on this ring.
+	 */
+	intel_notify_ring(ring);
+
 	head_req = list_first_entry_or_null(&ring->execlist_queue,
 					    struct drm_i915_gem_request,
 					    execlist_link);
@@ -1812,7 +1831,12 @@ static int gen8_emit_request(struct drm_i915_gem_request *request)
 				(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)));
 	intel_logical_ring_emit(ringbuf, 0);
 	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
-	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
+	/*
+	 * The following noop is a placeholder for an optional MI_USER_INTERRUPT
+	 * which will be patched in at ELSP submission time if this request
+	 * will not end with a context complete interrupt.
+	 */
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
 	intel_logical_ring_advance_and_submit(request);
 
-- 
1.9.1


* Re: [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
  2015-12-18 11:59 [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed Tvrtko Ursulin
@ 2015-12-18 12:28 ` Chris Wilson
  2015-12-18 13:51   ` Tvrtko Ursulin
  2015-12-18 12:30 ` ✗ failure: Fi.CI.BAT Patchwork
  2015-12-18 12:31 ` [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed Chris Wilson
  2 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2015-12-18 12:28 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Dec 18, 2015 at 11:59:41AM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> We can rely on the context complete interrupt to wake up the waiters,
> except in the case where requests are merged into a single ELSP
> submission. In that case we inject MI_USER_INTERRUPT instructions into
> the ring buffer to ensure prompt wake-ups.
> 
> On the GLBenchmark Egypt off-screen test, for example, this
> optimization decreases the number of generated interrupts per second
> by a factor of two and the number of context switches by a factor of
> five to six.

I half like it. Are the interrupts a limiting factor in this case though?
This should be ~100 waits/second with ~1000 batches/second, right? What
is the delay between request completion and client wakeup - difficult to
measure after you remove the user interrupt though! But I estimate it
should be on the order of just a few GPU cycles.

> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 27f06198a51e..d9be878dbde7 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -359,6 +359,13 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>  	spin_unlock(&dev_priv->uncore.lock);
>  }
>  
> +static void execlists_emit_user_interrupt(struct drm_i915_gem_request *req)
> +{
> +	struct intel_ringbuffer *ringbuf = req->ringbuf;
> +
> +	iowrite32(MI_USER_INTERRUPT, ringbuf->virtual_start + req->tail - 8);
> +}
> +
>  static int execlists_update_context(struct drm_i915_gem_request *rq)
>  {
>  	struct intel_engine_cs *ring = rq->ring;
> @@ -433,6 +440,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>  			cursor->elsp_submitted = req0->elsp_submitted;
>  			list_move_tail(&req0->execlist_link,
>  				       &ring->execlist_retired_req_list);
> +			/*
> +			 * When merging requests make sure there is still
> +			 * something after each batch buffer to wake up waiters.
> +			 */
> +			if (cursor != req0)
> +				execlists_emit_user_interrupt(req0);

You may have already missed this instruction by the time you patch it,
and keep doing so as long as the context is resubmitted. I think to be
safe, you need to patch cursor as well. You could then MI_NOOP out the
MI_USER_INTERRUPT on the terminal request.

An interesting igt experiment I think would be:

thread A, keep queuing batches with just a single MI_STORE_DWORD_IMM *addr
thread B, waits on batch from A, reads *addr (asynchronously), measures
latency (actual value - expected(batch))

Run for 10s, report min/max/median latency.

Repeat for more threads/contexts and more waiters. Ah, that may be the
demonstration for the thundering herd I've been looking for!
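
Hand-waving the execbuf plumbing, the per-batch payload for thread A
above could be as small as the sketch below (written from memory, so
the MI_STORE_DWORD_IMM flags, e.g. the address-space bit, would need
checking against bspec):

#include <stdint.h>

static int emit_store_dword(uint32_t *cs, uint64_t addr, uint32_t value)
{
	int i = 0;

	/* MI_STORE_DWORD_IMM, gen8 layout: header, addr low, addr high, data */
	cs[i++] = (0x20 << 23) | (4 - 2);
	cs[i++] = (uint32_t)addr;
	cs[i++] = (uint32_t)(addr >> 32);
	cs[i++] = value;

	/* MI_BATCH_BUFFER_END */
	cs[i++] = 0xa << 23;

	return i;
}
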
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* ✗ failure: Fi.CI.BAT
  2015-12-18 11:59 [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed Tvrtko Ursulin
  2015-12-18 12:28 ` Chris Wilson
@ 2015-12-18 12:30 ` Patchwork
  2015-12-18 12:31 ` [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed Chris Wilson
  2 siblings, 0 replies; 7+ messages in thread
From: Patchwork @ 2015-12-18 12:30 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: intel-gfx

== Summary ==

HEAD is now at 3c71db8 drm-intel-nightly: 2015y-12m-18d-09h-42m-49s UTC integration manifest
Applying: drm/i915/bdw+: Do not emit user interrupts when not needed
Using index info to reconstruct a base tree...
M	drivers/gpu/drm/i915/intel_lrc.c
Falling back to patching base and 3-way merge...
Auto-merging drivers/gpu/drm/i915/intel_lrc.c
CONFLICT (content): Merge conflict in drivers/gpu/drm/i915/intel_lrc.c
Patch failed at 0001 drm/i915/bdw+: Do not emit user interrupts when not needed


* Re: [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
  2015-12-18 11:59 [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed Tvrtko Ursulin
  2015-12-18 12:28 ` Chris Wilson
  2015-12-18 12:30 ` ✗ failure: Fi.CI.BAT Patchwork
@ 2015-12-18 12:31 ` Chris Wilson
  2 siblings, 0 replies; 7+ messages in thread
From: Chris Wilson @ 2015-12-18 12:31 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Dec 18, 2015 at 11:59:41AM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> We can rely on the context complete interrupt to wake up the waiters,
> except in the case where requests are merged into a single ELSP
> submission. In that case we inject MI_USER_INTERRUPT instructions into
> the ring buffer to ensure prompt wake-ups.
> 
> On the GLBenchmark Egypt off-screen test, for example, this
> optimization decreases the number of generated interrupts per second
> by a factor of two and the number of context switches by a factor of
> five to six.

Another question is how many of those are distinct master interrupts? If
the latency between the user interrupt + context switch interrupt is
small enough not to be noticeable, then it should be small enough to be
in the same IIR (or next in the double buffered IIR)? Why is that
reasoning off?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
  2015-12-18 12:28 ` Chris Wilson
@ 2015-12-18 13:51   ` Tvrtko Ursulin
  2015-12-18 14:29     ` Chris Wilson
  0 siblings, 1 reply; 7+ messages in thread
From: Tvrtko Ursulin @ 2015-12-18 13:51 UTC (permalink / raw)
  To: Chris Wilson, Intel-gfx, Tvrtko Ursulin


On 18/12/15 12:28, Chris Wilson wrote:
> On Fri, Dec 18, 2015 at 11:59:41AM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> We can rely on the context complete interrupt to wake up the waiters,
>> except in the case where requests are merged into a single ELSP
>> submission. In that case we inject MI_USER_INTERRUPT instructions into
>> the ring buffer to ensure prompt wake-ups.
>>
>> On the GLBenchmark Egypt off-screen test, for example, this
>> optimization decreases the number of generated interrupts per second
>> by a factor of two and the number of context switches by a factor of
>> five to six.
>
> I half like it. Are the interrupts a limiting factor in this case though?
> This should be ~100 waits/second with ~1000 batches/second, right? What
> is the delay between request completion and client wakeup - difficult to
> measure after you remove the user interrupt though! But I estimate it
> should be on the order of just a few GPU cycles.

Neither of the two benchmarks I ran (trex onscreen and egypt offscreen) 
shows any framerate improvement.

The only thing I did manage to measure is that CPU energy usage goes 
down with the optimisation. Roughly 8-10%, courtesy of a RAPL script 
someone posted here.
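
The idea is just to sample the package energy counter before and after
a run, along the lines of the sketch below (this is not the script that
was posted, only the same idea; the powercap sysfs path is an
assumption and can differ between machines, and counter wrap-around is
ignored):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static uint64_t read_energy_uj(void)
{
	uint64_t uj = 0;
	FILE *f = fopen("/sys/class/powercap/intel-rapl:0/energy_uj", "r");

	if (f) {
		if (fscanf(f, "%" SCNu64, &uj) != 1)
			uj = 0;
		fclose(f);
	}

	return uj;
}

int main(void)
{
	uint64_t start = read_energy_uj();

	sleep(10); /* run the benchmark during this window */

	printf("package energy: %.3f J\n",
	       (read_energy_uj() - start) / 1e6);

	return 0;
}
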

Benchmarking is generally very hard so it is a pity we don't have a farm 
similar to CI which does it all in a repeatable and solid manner.

>> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
>> index 27f06198a51e..d9be878dbde7 100644
>> --- a/drivers/gpu/drm/i915/intel_lrc.c
>> +++ b/drivers/gpu/drm/i915/intel_lrc.c
>> @@ -359,6 +359,13 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
>>   	spin_unlock(&dev_priv->uncore.lock);
>>   }
>>
>> +static void execlists_emit_user_interrupt(struct drm_i915_gem_request *req)
>> +{
>> +	struct intel_ringbuffer *ringbuf = req->ringbuf;
>> +
>> +	iowrite32(MI_USER_INTERRUPT, ringbuf->virtual_start + req->tail - 8);
>> +}
>> +
>>   static int execlists_update_context(struct drm_i915_gem_request *rq)
>>   {
>>   	struct intel_engine_cs *ring = rq->ring;
>> @@ -433,6 +440,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
>>   			cursor->elsp_submitted = req0->elsp_submitted;
>>   			list_move_tail(&req0->execlist_link,
>>   				       &ring->execlist_retired_req_list);
>> +			/*
>> +			 * When merging requests make sure there is still
>> +			 * something after each batch buffer to wake up waiters.
>> +			 */
>> +			if (cursor != req0)
>> +				execlists_emit_user_interrupt(req0);
>
> You may have already missed this instruction by the time you patch it,
> and keep doing so as long as the context is resubmitted. I think to be
> safe, you need to patch cursor as well. You could then MI_NOOP out the
> MI_USER_INTERRUPT on the terminal request.

I don't at the moment see how it could miss it. We don't do preemption, 
but granted I don't understand this code fully.

But patching it out definitely looks safer. And I don't even have to 
unbreak GuC in that case. So I'll try that approach.
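
Roughly, and untested, the inverse of the helper in this patch, on the
assumption that gen8_emit_request() keeps emitting MI_USER_INTERRUPT
and we only downgrade it on the request which the context complete
interrupt already covers:

static void execlists_cancel_user_interrupt(struct drm_i915_gem_request *req)
{
	struct intel_ringbuffer *ringbuf = req->ringbuf;

	/* Replace the trailing MI_USER_INTERRUPT with a no-op. */
	iowrite32(MI_NOOP, ringbuf->virtual_start + req->tail - 8);
}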

> An interesting igt experiment I think would be:
>
> thread A, keep queuing batches with just a single MI_STORE_DWORD_IMM *addr
> thread B, waits on batch from A, reads *addr (asynchronously), measures
> latency (actual value - expected(batch))
>
> Run for 10s, report min/max/median latency.
>
> Repeat for more threads/contexts and more waiters. Ah, that may be the
> demonstration for the thundering herd I've been looking for!

Hm I'll think about it.

Wrt your second reply, that is an interesting question.

All I can tell is that empirically it looks like the interrupts do 
arrive split, otherwise there would be no reduction in interrupt 
numbers. But why they are split I don't know.

I'll try adding some counters to get a feel for how often that happens 
in various scenarios.
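
Probably nothing more than an untested sketch along these lines,
counting how often the user interrupt and the context switch interrupt
for a ring arrive in the same IIR read (the counter field is made up):

static __always_inline void
gen8_cs_irq_handler(struct intel_engine_cs *ring, u32 iir, int test_shift)
{
	if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
		intel_notify_ring(ring);

	if (iir & (GT_CONTEXT_SWITCH_INTERRUPT << test_shift)) {
		/* Both bits delivered in one master IIR read? */
		if (iir & (GT_RENDER_USER_INTERRUPT << test_shift))
			ring->coalesced_irqs++; /* hypothetical counter */

		intel_lrc_irq_handler(ring);
	}
}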

Regards,

Tvrtko




* Re: [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
  2015-12-18 13:51   ` Tvrtko Ursulin
@ 2015-12-18 14:29     ` Chris Wilson
  2015-12-19  2:11       ` Chris Wilson
  0 siblings, 1 reply; 7+ messages in thread
From: Chris Wilson @ 2015-12-18 14:29 UTC (permalink / raw)
  To: Tvrtko Ursulin; +Cc: Intel-gfx

On Fri, Dec 18, 2015 at 01:51:38PM +0000, Tvrtko Ursulin wrote:
> 
> On 18/12/15 12:28, Chris Wilson wrote:
> >On Fri, Dec 18, 2015 at 11:59:41AM +0000, Tvrtko Ursulin wrote:
> >>From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> >>
> >>We can rely on the context complete interrupt to wake up the waiters,
> >>except in the case where requests are merged into a single ELSP
> >>submission. In that case we inject MI_USER_INTERRUPT instructions into
> >>the ring buffer to ensure prompt wake-ups.
> >>
> >>On the GLBenchmark Egypt off-screen test, for example, this
> >>optimization decreases the number of generated interrupts per second
> >>by a factor of two and the number of context switches by a factor of
> >>five to six.
> >
> >I half like it. Are the interrupts a limiting factor in this case though?
> >This should be ~100 waits/second with ~1000 batches/second, right? What
> >is the delay between request completion and client wakeup - difficult to
> >measure after you remove the user interrupt though! But I estimate it
> >should be on the order of just a few GPU cycles.
> 
> Neither of the two benchmarks I ran (trex onscreen and egypt
> offscreen) shows any framerate improvement.

Expected, but nice to have it confirmed. I expect you would have to get
into the synchronous workloads before the latency had any impact (1 us
on a 1 ms wait will be lost in the noise, but 1 us on a 2 us wait would
quickly build up).
 
> The only thing I did manage to measure is that CPU energy usage goes
> down with the optimisation. Roughly 8-10%, courtesy of a RAPL script
> someone posted here.

That's actually v.impressive - are you confident in the result? :)
 
> Benchmarking is generally very hard so it is a pity we don't have a
> farm similar to CI which does it all in a repeatable and solid
> manner.

Indeed. And to have stable, reliable benchmarking. Martin Peres has been
working on getting confidence in the benchmark farm. The caveats tend to
be that we have to run at low frequencies to avoid throttling (CPU/GPU),
run on fixed CPUs etc etc. To get reliable metrics we have to throw out
some interesting and complex variability of the real world. :|
 
> >>@@ -433,6 +440,12 @@ static void execlists_context_unqueue(struct intel_engine_cs *ring)
> >>  			cursor->elsp_submitted = req0->elsp_submitted;
> >>  			list_move_tail(&req0->execlist_link,
> >>  				       &ring->execlist_retired_req_list);
> >>+			/*
> >>+			 * When merging requests make sure there is still
> >>+			 * something after each batch buffer to wake up waiters.
> >>+			 */
> >>+			if (cursor != req0)
> >>+				execlists_emit_user_interrupt(req0);
> >
> >You may have already missed this instruction by the time you patch it,
> >and keep doing so as long as the context is resubmitted. I think to be
> >safe, you need to patch cursor as well. You could then MI_NOOP out the
> >MI_USER_INTERRUPT on the terminal request.
> 
> I don't at the moment see how it could miss it. We don't do preemption,
> but granted I don't understand this code fully.

The GPU is currently executing the context that was on port[1]. It may
already have executed all the instructions up to and including the
instruction being patched (from the last ELSP write). Ok, that's a
window of just one missed interrupt on the resubmitted request - not as
big a deal as I first thought.
 
> But patching it out definitely looks safer. And I don't even have to
> unbreak GuC in that case. So I'll try that approach.

It still has the same hole on the resubmitted request, but it may
be simpler.

> >An interesting igt experiment I think would be:
> >
> >thread A, keep queuing batches with just a single MI_STORE_DWORD_IMM *addr
> >thread B, waits on batch from A, reads *addr (asynchronously), measures
> >latency (actual value - expected(batch))
> >
> >Run for 10s, report min/max/median latency.
> >
> >Repeat for more threads/contexts and more waiters. Ah, that may be the
> >demonstration for the thundering herd I've been looking for!
> 
> Hm I'll think about it.

I'm working on this as I have been trying to get something to measure
the thundering herd issue (I have one benchmark that measures how much
CPU time we steal to handle interrupts, but it is horrible).
 
> Wrt your second reply, that is an interesting question.
> 
> All I can tell is that empirically it looks like the interrupts do
> arrive split, otherwise there would be no reduction in interrupt
> numbers. But why they are split I don't know.
> 
> I'll try adding some counters to get a feel for how often that
> happens in various scenarios.

It may just be that on the CPU timescale a few GPU instructions are
enough time for us to process the first interrupt and go back to sleep. But
it has to be shorter than the context switch to the bottom-half, I would
guess.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

* Re: [RFC] drm/i915/bdw+: Do not emit user interrupts when not needed
  2015-12-18 14:29     ` Chris Wilson
@ 2015-12-19  2:11       ` Chris Wilson
  0 siblings, 0 replies; 7+ messages in thread
From: Chris Wilson @ 2015-12-19  2:11 UTC (permalink / raw)
  To: Tvrtko Ursulin, Intel-gfx, Tvrtko Ursulin

On Fri, Dec 18, 2015 at 02:29:22PM +0000, Chris Wilson wrote:
> On Fri, Dec 18, 2015 at 01:51:38PM +0000, Tvrtko Ursulin wrote:
> > On 18/12/15 12:28, Chris Wilson wrote:
> > >An interesting igt experiment I think would be:
> > >
> > >thread A, keep queuing batches with just a single MI_STORE_DWORD_IMM *addr
> > >thread B, waits on batch from A, reads *addr (asynchronously), measures
> > >latency (actual value - expected(batch))
> > >
> > >Run for 10s, report min/max/median latency.
> > >
> > >Repeat for more threads/contexts and more waiters. Ah, that may be the
> > >demonstration for the thundering herd I've been looking for!
> > 
> > Hm I'll think about it.
> 
> I'm working on this as I have been trying to get something to measure
> the thundering herd issue (I have one benchmark that measures how much
> CPU time we steal to handle interrupts, but it is horrible).

I remembered that we have timestamp registers. So
igt/benchmarks/gem_latency measures the number of engine cycles from the
end of a batch to the time the client is woken up (for a variety of
producer:consumers). That should be suitable to serve as a basis for
measurements.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
