All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Write RING_TAIL once per-request
@ 2013-08-10 21:16 Chris Wilson
  2013-08-26 20:42 ` Ben Widawsky
  0 siblings, 1 reply; 4+ messages in thread
From: Chris Wilson @ 2013-08-10 21:16 UTC (permalink / raw)
  To: intel-gfx

Ignoring the legacy DRI1 code, and a couple of special cases (to be
discussed later), all access to the ring is mediated through requests.
The first write to a ring will grab a seqno and mark the ring as having
an outstanding_lazy_request. Either through explicitly adding a request
after an execbuffer or through an implicit wait (either by the CPU or by
a semaphore), that sequence of writes will be terminated with a request.
So we can elide all the intervening writes to the tail register and
send the entire command stream to the GPU at once. This will reduce the
number of *serialising* writes to the tail register by a factor of 3-5
times (depending upon architecture and number of workarounds, context
switches, etc involved). This becomes even more noticeable when the
register write is overloaded with a number of debugging tools. The
astute reader will wonder if it is then possible to overflow the ring
with a single command. It is not. When we start a command sequence to
the ring, we check for available space and issue a wait in case we have
not. The ring wait will in this case be forced to flush the outstanding
register write and then poll the ACTHD for sufficient space to continue.

The exception to the rule where everything is inside a request are a few
initialisation cases where we may want to write GPU commands via the CS
before userspace wakes up and page flips.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_dma.c         |  2 +-
 drivers/gpu/drm/i915/i915_gem_exec.c    |  2 +-
 drivers/gpu/drm/i915/intel_display.c    | 10 +++++-----
 drivers/gpu/drm/i915/intel_ringbuffer.c | 30 ++++++++++++++++--------------
 drivers/gpu/drm/i915/intel_ringbuffer.h |  7 ++++++-
 5 files changed, 29 insertions(+), 22 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index de0b86a..a7a0eb86 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -52,7 +52,7 @@
 	intel_ring_emit(LP_RING(dev_priv), x)
 
 #define ADVANCE_LP_RING() \
-	intel_ring_advance(LP_RING(dev_priv))
+	__intel_ring_advance(LP_RING(dev_priv))
 
 /**
  * Lock test for when it's just for synchronization of ring access.
diff --git a/drivers/gpu/drm/i915/i915_gem_exec.c b/drivers/gpu/drm/i915/i915_gem_exec.c
index a2c6dbf..4da3704 100644
--- a/drivers/gpu/drm/i915/i915_gem_exec.c
+++ b/drivers/gpu/drm/i915/i915_gem_exec.c
@@ -111,7 +111,7 @@ int i915_gem_exec_clear_object(struct drm_i915_gem_object *obj)
 	intel_ring_emit(ring, 0);
 	intel_ring_emit(ring, MI_NOOP);
 
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	i915_gem_exec_dirty_object(obj, ring);
 
 unpin:
diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index b26b50b..6b7ce06 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -7547,7 +7547,7 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
 	intel_ring_emit(ring, 0); /* aux display base address, unused */
 
 	intel_mark_page_flip_active(intel_crtc);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	return 0;
 
 err_unpin:
@@ -7588,7 +7588,7 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
 	intel_ring_emit(ring, MI_NOOP);
 
 	intel_mark_page_flip_active(intel_crtc);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	return 0;
 
 err_unpin:
@@ -7636,7 +7636,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
 	intel_ring_emit(ring, pf | pipesrc);
 
 	intel_mark_page_flip_active(intel_crtc);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	return 0;
 
 err_unpin:
@@ -7680,7 +7680,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
 	intel_ring_emit(ring, pf | pipesrc);
 
 	intel_mark_page_flip_active(intel_crtc);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	return 0;
 
 err_unpin:
@@ -7736,7 +7736,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
 	intel_ring_emit(ring, (MI_NOOP));
 
 	intel_mark_page_flip_active(intel_crtc);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 	return 0;
 
 err_unpin:
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index 7de95da..8d5dd65 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -51,6 +51,16 @@ static inline int ring_space(struct intel_ring_buffer *ring)
 	return space;
 }
 
+void __intel_ring_advance(struct intel_ring_buffer *ring)
+{
+	struct drm_i915_private *dev_priv = ring->dev->dev_private;
+
+	ring->tail &= ring->size - 1;
+	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
+		return;
+	ring->write_tail(ring, ring->tail);
+}
+
 static int
 gen2_render_ring_flush(struct intel_ring_buffer *ring,
 		       u32	invalidate_domains,
@@ -673,7 +683,7 @@ gen6_add_request(struct intel_ring_buffer *ring)
 	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
 	intel_ring_emit(ring, ring->outstanding_lazy_request);
 	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 
 	return 0;
 }
@@ -787,7 +797,7 @@ pc_render_add_request(struct intel_ring_buffer *ring)
 	intel_ring_emit(ring, pc->gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
 	intel_ring_emit(ring, ring->outstanding_lazy_request);
 	intel_ring_emit(ring, 0);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 
 	return 0;
 }
@@ -1016,7 +1026,7 @@ i9xx_add_request(struct intel_ring_buffer *ring)
 	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
 	intel_ring_emit(ring, ring->outstanding_lazy_request);
 	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_advance(ring);
+	__intel_ring_advance(ring);
 
 	return 0;
 }
@@ -1472,6 +1482,9 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
 	if (ret != -ENOSPC)
 		return ret;
 
+	/* force the tail write in case we have been skipping them */
+	__intel_ring_advance(ring);
+
 	trace_i915_ring_wait_begin(ring);
 	/* With GEM the hangcheck timer should kick us out of the loop,
 	 * leaving it early runs the risk of corrupting GEM state (due
@@ -1614,17 +1627,6 @@ void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno)
 	ring->hangcheck.seqno = seqno;
 }
 
-void intel_ring_advance(struct intel_ring_buffer *ring)
-{
-	struct drm_i915_private *dev_priv = ring->dev->dev_private;
-
-	ring->tail &= ring->size - 1;
-	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
-		return;
-	ring->write_tail(ring, ring->tail);
-}
-
-
 static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
 				     u32 value)
 {
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 6e38256..d40eff8 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -232,7 +232,12 @@ static inline void intel_ring_emit(struct intel_ring_buffer *ring,
 	iowrite32(data, ring->virtual_start + ring->tail);
 	ring->tail += 4;
 }
-void intel_ring_advance(struct intel_ring_buffer *ring);
+static inline void intel_ring_advance(struct intel_ring_buffer *ring)
+{
+	ring->tail &= ring->size - 1;
+}
+void __intel_ring_advance(struct intel_ring_buffer *ring);
+
 int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
 void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
 int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);
-- 
1.8.4.rc1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Write RING_TAIL once per-request
  2013-08-10 21:16 [PATCH] drm/i915: Write RING_TAIL once per-request Chris Wilson
@ 2013-08-26 20:42 ` Ben Widawsky
  2013-09-10 13:01   ` Chris Wilson
  0 siblings, 1 reply; 4+ messages in thread
From: Ben Widawsky @ 2013-08-26 20:42 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Sat, Aug 10, 2013 at 10:16:32PM +0100, Chris Wilson wrote:
> Ignoring the legacy DRI1 code, and a couple of special cases (to be
> discussed later), all access to the ring is mediated through requests.
> The first write to a ring will grab a seqno and mark the ring as having
> an outstanding_lazy_request. Either through explicitly adding a request
> after an execbuffer or through an implicit wait (either by the CPU or by
> a semaphore), that sequence of writes will be terminated with a request.
> So we can ellide all the intervening writes to the tail register and
> send the entire command stream to the GPU at once. This will reduce the
> number of *serialising* writes to the tail register by a factor or 3-5
> times (depending upon architecture and number of workarounds, context
> switches, etc involved). This becomes even more noticeable when the
> register write is overloaded with a number of debugging tools. The
> astute reader will wonder if it is then possible to overflow the ring
> with a single command. It is not. When we start a command sequence to
> the ring, we check for available space and issue a wait in case we have
> not. The ring wait will in this case be forced to flush the outstanding
> register write and then poll the ACTHD for sufficient space to continue.
> 
> The exception to the rule where everything is inside a request are a few
> initialisation cases where we may want to write GPU commands via the CS
> before userspace wakes up and page flips.
> 

I'm not a huge fan of having the second intel_ring_advance() that does something
other than it sounds like. I'd *much* prefer to not intel_ring_advance()
if you are certain more emits will be coming like in the case you
mention above. We can add a paranoia check whenever we're about to
return to userspace that tail == RING_TAIL

Also, without performance data, it's hard to say this indirection is
worth it.

> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>  drivers/gpu/drm/i915/i915_dma.c         |  2 +-
>  drivers/gpu/drm/i915/i915_gem_exec.c    |  2 +-
>  drivers/gpu/drm/i915/intel_display.c    | 10 +++++-----
>  drivers/gpu/drm/i915/intel_ringbuffer.c | 30 ++++++++++++++++--------------
>  drivers/gpu/drm/i915/intel_ringbuffer.h |  7 ++++++-
>  5 files changed, 29 insertions(+), 22 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index de0b86a..a7a0eb86 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -52,7 +52,7 @@
>  	intel_ring_emit(LP_RING(dev_priv), x)
>  
>  #define ADVANCE_LP_RING() \
> -	intel_ring_advance(LP_RING(dev_priv))
> +	__intel_ring_advance(LP_RING(dev_priv))
>  
>  /**
>   * Lock test for when it's just for synchronization of ring access.
> diff --git a/drivers/gpu/drm/i915/i915_gem_exec.c b/drivers/gpu/drm/i915/i915_gem_exec.c
> index a2c6dbf..4da3704 100644
> --- a/drivers/gpu/drm/i915/i915_gem_exec.c
> +++ b/drivers/gpu/drm/i915/i915_gem_exec.c
> @@ -111,7 +111,7 @@ int i915_gem_exec_clear_object(struct drm_i915_gem_object *obj)
>  	intel_ring_emit(ring, 0);
>  	intel_ring_emit(ring, MI_NOOP);
>  
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	i915_gem_exec_dirty_object(obj, ring);
>  
>  unpin:
> diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
> index b26b50b..6b7ce06 100644
> --- a/drivers/gpu/drm/i915/intel_display.c
> +++ b/drivers/gpu/drm/i915/intel_display.c
> @@ -7547,7 +7547,7 @@ static int intel_gen2_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, 0); /* aux display base address, unused */
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7588,7 +7588,7 @@ static int intel_gen3_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, MI_NOOP);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7636,7 +7636,7 @@ static int intel_gen4_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7680,7 +7680,7 @@ static int intel_gen6_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, pf | pipesrc);
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> @@ -7736,7 +7736,7 @@ static int intel_gen7_queue_flip(struct drm_device *dev,
>  	intel_ring_emit(ring, (MI_NOOP));
>  
>  	intel_mark_page_flip_active(intel_crtc);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  	return 0;
>  
>  err_unpin:
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
> index 7de95da..8d5dd65 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.c
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
> @@ -51,6 +51,16 @@ static inline int ring_space(struct intel_ring_buffer *ring)
>  	return space;
>  }
>  
> +void __intel_ring_advance(struct intel_ring_buffer *ring)
> +{
> +	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> +
> +	ring->tail &= ring->size - 1;
> +	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
> +		return;
> +	ring->write_tail(ring, ring->tail);
> +}
> +
>  static int
>  gen2_render_ring_flush(struct intel_ring_buffer *ring,
>  		       u32	invalidate_domains,
> @@ -673,7 +683,7 @@ gen6_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, MI_USER_INTERRUPT);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -787,7 +797,7 @@ pc_render_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, pc->gtt_offset | PIPE_CONTROL_GLOBAL_GTT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, 0);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -1016,7 +1026,7 @@ i9xx_add_request(struct intel_ring_buffer *ring)
>  	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
>  	intel_ring_emit(ring, ring->outstanding_lazy_request);
>  	intel_ring_emit(ring, MI_USER_INTERRUPT);
> -	intel_ring_advance(ring);
> +	__intel_ring_advance(ring);
>  
>  	return 0;
>  }
> @@ -1472,6 +1482,9 @@ static int ring_wait_for_space(struct intel_ring_buffer *ring, int n)
>  	if (ret != -ENOSPC)
>  		return ret;
>  
> +	/* force the tail write in case we have been skipping them */
> +	__intel_ring_advance(ring);
> +
>  	trace_i915_ring_wait_begin(ring);
>  	/* With GEM the hangcheck timer should kick us out of the loop,
>  	 * leaving it early runs the risk of corrupting GEM state (due
> @@ -1614,17 +1627,6 @@ void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno)
>  	ring->hangcheck.seqno = seqno;
>  }
>  
> -void intel_ring_advance(struct intel_ring_buffer *ring)
> -{
> -	struct drm_i915_private *dev_priv = ring->dev->dev_private;
> -
> -	ring->tail &= ring->size - 1;
> -	if (dev_priv->gpu_error.stop_rings & intel_ring_flag(ring))
> -		return;
> -	ring->write_tail(ring, ring->tail);
> -}
> -
> -
>  static void gen6_bsd_ring_write_tail(struct intel_ring_buffer *ring,
>  				     u32 value)
>  {
> diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
> index 6e38256..d40eff8 100644
> --- a/drivers/gpu/drm/i915/intel_ringbuffer.h
> +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
> @@ -232,7 +232,12 @@ static inline void intel_ring_emit(struct intel_ring_buffer *ring,
>  	iowrite32(data, ring->virtual_start + ring->tail);
>  	ring->tail += 4;
>  }
> -void intel_ring_advance(struct intel_ring_buffer *ring);
> +static inline void intel_ring_advance(struct intel_ring_buffer *ring)
> +{
> +	ring->tail &= ring->size - 1;
> +}
> +void __intel_ring_advance(struct intel_ring_buffer *ring);
> +
>  int __must_check intel_ring_idle(struct intel_ring_buffer *ring);
>  void intel_ring_init_seqno(struct intel_ring_buffer *ring, u32 seqno);
>  int intel_ring_flush_all_caches(struct intel_ring_buffer *ring);

Functions you missed which I think should have an actual tail write:
ironlake_enable_rc6
intel_overlay_* (not sure about these, just a guess)


Also, this makes a nice opportunity to not actually switch back on
add_request fail in do_switch()


In any case it's (begrudgingly):
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>

-- 
Ben Widawsky, Intel Open Source Technology Center

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Write RING_TAIL once per-request
  2013-08-26 20:42 ` Ben Widawsky
@ 2013-09-10 13:01   ` Chris Wilson
  2013-09-10 14:13     ` Daniel Vetter
  0 siblings, 1 reply; 4+ messages in thread
From: Chris Wilson @ 2013-09-10 13:01 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Mon, Aug 26, 2013 at 01:42:12PM -0700, Ben Widawsky wrote:
> On Sat, Aug 10, 2013 at 10:16:32PM +0100, Chris Wilson wrote:
> > Ignoring the legacy DRI1 code, and a couple of special cases (to be
> > discussed later), all access to the ring is mediated through requests.
> > The first write to a ring will grab a seqno and mark the ring as having
> > an outstanding_lazy_request. Either through explicitly adding a request
> > after an execbuffer or through an implicit wait (either by the CPU or by
> > a semaphore), that sequence of writes will be terminated with a request.
> > So we can ellide all the intervening writes to the tail register and
> > send the entire command stream to the GPU at once. This will reduce the
> > number of *serialising* writes to the tail register by a factor or 3-5
> > times (depending upon architecture and number of workarounds, context
> > switches, etc involved). This becomes even more noticeable when the
> > register write is overloaded with a number of debugging tools. The
> > astute reader will wonder if it is then possible to overflow the ring
> > with a single command. It is not. When we start a command sequence to
> > the ring, we check for available space and issue a wait in case we have
> > not. The ring wait will in this case be forced to flush the outstanding
> > register write and then poll the ACTHD for sufficient space to continue.
> > 
> > The exception to the rule where everything is inside a request are a few
> > initialisation cases where we may want to write GPU commands via the CS
> > before userspace wakes up and page flips.
> > 
> 
> I'm not a huge fan of having the second intel_ring_advance() that does something
> other than it sounds like. I'd *much* prefer to not intel_ring_advance()
> if you are certain more emits will be coming like in the case you
> mention above. We can add a paranoia check whenever we're about to
> return to userspace that tail == RING_TAIL
> 
> Also, without performance data, it's hard to say this indirection is
> worth it.

Just a sample, UXA on i5-2520qm, aa10text:
2580000.0/sec -> 2980000.0/sec
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] drm/i915: Write RING_TAIL once per-request
  2013-09-10 13:01   ` Chris Wilson
@ 2013-09-10 14:13     ` Daniel Vetter
  0 siblings, 0 replies; 4+ messages in thread
From: Daniel Vetter @ 2013-09-10 14:13 UTC (permalink / raw)
  To: Chris Wilson, Ben Widawsky, intel-gfx

On Tue, Sep 10, 2013 at 02:01:20PM +0100, Chris Wilson wrote:
> On Mon, Aug 26, 2013 at 01:42:12PM -0700, Ben Widawsky wrote:
> > On Sat, Aug 10, 2013 at 10:16:32PM +0100, Chris Wilson wrote:
> > > Ignoring the legacy DRI1 code, and a couple of special cases (to be
> > > discussed later), all access to the ring is mediated through requests.
> > > The first write to a ring will grab a seqno and mark the ring as having
> > > an outstanding_lazy_request. Either through explicitly adding a request
> > > after an execbuffer or through an implicit wait (either by the CPU or by
> > > a semaphore), that sequence of writes will be terminated with a request.
> > > So we can ellide all the intervening writes to the tail register and
> > > send the entire command stream to the GPU at once. This will reduce the
> > > number of *serialising* writes to the tail register by a factor or 3-5
> > > times (depending upon architecture and number of workarounds, context
> > > switches, etc involved). This becomes even more noticeable when the
> > > register write is overloaded with a number of debugging tools. The
> > > astute reader will wonder if it is then possible to overflow the ring
> > > with a single command. It is not. When we start a command sequence to
> > > the ring, we check for available space and issue a wait in case we have
> > > not. The ring wait will in this case be forced to flush the outstanding
> > > register write and then poll the ACTHD for sufficient space to continue.
> > > 
> > > The exception to the rule where everything is inside a request are a few
> > > initialisation cases where we may want to write GPU commands via the CS
> > > before userspace wakes up and page flips.
> > > 
> > 
> > I'm not a huge fan of having the second intel_ring_advance() that does something
> > other than it sounds like. I'd *much* prefer to not intel_ring_advance()
> > if you are certain more emits will be coming like in the case you
> > mention above. We can add a paranoia check whenever we're about to
> > return to userspace that tail == RING_TAIL
> > 
> > Also, without performance data, it's hard to say this indirection is
> > worth it.
> 
> Just a sample, UXA on i5-2520qm, aa10text:
> 2580000.0/sec -> 2980000.0/sec

Queued for -next, thanks for the patch.
-Daniel
-- 
Daniel Vetter
Software Engineer, Intel Corporation
+41 (0) 79 365 57 48 - http://blog.ffwll.ch

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-09-10 14:13 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-10 21:16 [PATCH] drm/i915: Write RING_TAIL once per-request Chris Wilson
2013-08-26 20:42 ` Ben Widawsky
2013-09-10 13:01   ` Chris Wilson
2013-09-10 14:13     ` Daniel Vetter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.