* [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
@ 2011-11-29 15:12 Chris Wilson
  2011-11-29 15:34 ` Daniel Vetter
                   ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Chris Wilson @ 2011-11-29 15:12 UTC (permalink / raw)
  To: intel-gfx

We try to avoid writing the relocations through the uncached GTT, if the
buffer is currently in the CPU write domain and so will be flushed out to
main memory afterwards anyway. Also on SandyBridge we can safely write
to the pages in cacheable memory, so long as the buffer is LLC mapped.
In either of these caches, we therefore do not need to force the
reallocation of the buffer into the mappable region of the GTT, reducing
the aperture pressure.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   24 +++++++++++++++++-------
 1 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index f71f26e..9b8d543 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -265,6 +265,12 @@ eb_destroy(struct eb_objects *eb)
 	kfree(eb);
 }
 
+static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+{
+	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -351,7 +357,7 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 	}
 
 	reloc->delta += target_offset;
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
+	if (use_cpu_reloc(obj)) {
 		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
@@ -463,6 +469,13 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
 #define  __EXEC_OBJECT_HAS_FENCE (1<<31)
 
 static int
+need_reloc_mappable(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
+	return entry->relocation_count && !use_cpu_reloc(obj);
+}
+
+static int
 pin_and_fence_object(struct drm_i915_gem_object *obj,
 		     struct intel_ring_buffer *ring)
 {
@@ -475,8 +488,7 @@ pin_and_fence_object(struct drm_i915_gem_object *obj,
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable =
-		entry->relocation_count ? true : need_fence;
+	need_mappable = need_fence || need_reloc_mappable(obj);
 
 	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable);
 	if (ret)
@@ -532,8 +544,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable =
-			entry->relocation_count ? true : need_fence;
+		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if (need_mappable)
 			list_move(&obj->exec_list, &ordered_objects);
@@ -573,8 +584,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 				has_fenced_gpu_access &&
 				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 				obj->tiling_mode != I915_TILING_NONE;
-			need_mappable =
-				entry->relocation_count ? true : need_fence;
+			need_mappable = need_fence || need_reloc_mappable(obj);
 
 			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
 			    (need_mappable && !obj->map_and_fenceable))
-- 
1.7.7.3

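For readers skimming the diff: the net effect of the two new helpers,
use_cpu_reloc() and need_reloc_mappable(), is that an object is only
forced into the mappable GTT for relocation writes when the CPU path
cannot be used. Expanded into a single condition (a restatement of the
diff above for clarity, not code from the patch):

	need_mappable = need_fence ||
			(entry->relocation_count &&
			 obj->base.write_domain != I915_GEM_DOMAIN_CPU &&
			 obj->cache_level == I915_CACHE_NONE);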

* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 15:12 [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU Chris Wilson
@ 2011-11-29 15:34 ` Daniel Vetter
  2011-11-29 16:48   ` Chris Wilson
  2011-11-29 19:22 ` Daniel Vetter
  2011-11-29 21:32 ` Ben Widawsky
  2 siblings, 1 reply; 14+ messages in thread
From: Daniel Vetter @ 2011-11-29 15:34 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> We try to avoid writing the relocations through the uncached GTT, if the
> buffer is currently in the CPU write domain and so will be flushed out to
> main memory afterwards anyway. Also on SandyBridge we can safely write
> to the pages in cacheable memory, so long as the buffer is LLC mapped.
> In either of these caches, we therefore do not need to force the
> reallocation of the buffer into the mappable region of the GTT, reducing
> the aperture pressure.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

The error_state capture currently relies on us pinning buffers as mappable
when they contain relocations (and userspace always submitting a
batchbuffer containing relocations). You break that guarantee without
fixing up the error capture code. Otherwise I like this.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 15:34 ` Daniel Vetter
@ 2011-11-29 16:48   ` Chris Wilson
  2011-11-29 17:03     ` Daniel Vetter
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2011-11-29 16:48 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Tue, 29 Nov 2011 16:34:41 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> > We try to avoid writing the relocations through the uncached GTT, if the
> > buffer is currently in the CPU write domain and so will be flushed out to
> > main memory afterwards anyway. Also on SandyBridge we can safely write
> > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > In either of these caches, we therefore do not need to force the
> > reallocation of the buffer into the mappable region of the GTT, reducing
> > the aperture pressure.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> 
> The error_state capture currently relies on us pinning buffers as mappable
> when they contain relocations (and userspace always submitting a
> batchbuffer containing relocations). You break that guarantee without
> fixing up the error capture code. Otherwise I like this.

I may have sent that patch a little earlier. ;-)
That particular patch stands by itself: since we already use the full
GTT, we need a defense against reading through unmappable PTEs when
capturing some of our errors; you can't be paranoid enough.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 16:48   ` Chris Wilson
@ 2011-11-29 17:03     ` Daniel Vetter
  2011-11-29 17:15       ` Chris Wilson
  2011-11-29 17:17       ` Daniel Vetter
  0 siblings, 2 replies; 14+ messages in thread
From: Daniel Vetter @ 2011-11-29 17:03 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 04:48:15PM +0000, Chris Wilson wrote:
> On Tue, 29 Nov 2011 16:34:41 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> > On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> > > We try to avoid writing the relocations through the uncached GTT, if the
> > > buffer is currently in the CPU write domain and so will be flushed out to
> > > main memory afterwards anyway. Also on SandyBridge we can safely write
> > > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > > In either of these caches, we therefore do not need to force the
> > > reallocation of the buffer into the mappable region of the GTT, reducing
> > > the aperture pressure.
> > > 
> > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > The error_state capture currently relies on us pinning buffers as mappable
> > when they contain relocations (and userspace always submitting a
> > batchbuffer containing relocations). You break that guarantee without
> > fixing up the error capture code. Otherwise I like this.
> 
> I may have sent that patch a little earlier. ;-)

Yes, I know. My gripe is that this will reduce our chances of successfully
capturing the error_state, because now we expect to hit that case in the
error capture code whereas up to now it would have been a bug somewhere.
So either
- fixup the error_capture to fall back to cpu reads (needs the usual
  clflush dance if the object is not llc cached)
- or drop the pin mappable change in this patch.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

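A minimal sketch of the "clflush dance" fallback Daniel mentions above,
i.e. capturing an object's contents through the CPU when it is not
pinned into the mappable GTT. The helper name and its exact shape are
assumptions for illustration only; this is not the error-capture patch
referenced elsewhere in the thread:

	/* Illustrative only: CPU read of one object page for error capture. */
	static void
	capture_page_via_cpu(struct drm_i915_gem_object *obj, int page, u32 *dst)
	{
		void *vaddr;

		/* Non-LLC objects may have stale CPU cachelines; flush before reading. */
		if (obj->cache_level == I915_CACHE_NONE)
			drm_clflush_pages(&obj->pages[page], 1);

		vaddr = kmap_atomic(obj->pages[page]);
		memcpy(dst, vaddr, PAGE_SIZE);
		kunmap_atomic(vaddr);
	}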

* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 17:03     ` Daniel Vetter
@ 2011-11-29 17:15       ` Chris Wilson
  2011-11-29 17:17       ` Daniel Vetter
  1 sibling, 0 replies; 14+ messages in thread
From: Chris Wilson @ 2011-11-29 17:15 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Tue, 29 Nov 2011 18:03:53 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Tue, Nov 29, 2011 at 04:48:15PM +0000, Chris Wilson wrote:
> > On Tue, 29 Nov 2011 16:34:41 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> > > > We try to avoid writing the relocations through the uncached GTT, if the
> > > > buffer is currently in the CPU write domain and so will be flushed out to
> > > > main memory afterwards anyway. Also on SandyBridge we can safely write
> > > > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > > > In either of these caches, we therefore do not need to force the
> > > > reallocation of the buffer into the mappable region of the GTT, reducing
> > > > the aperture pressure.
> > > > 
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > The error_state capture currently relies on us pinning buffers as mappable
> > > when they contain relocations (and userspace always submitting a
> > > batchbuffer containing relocations). You break that guarantee without
> > > fixing up the error capture code. Otherwise I like this.
> > 
> > I may have sent that patch a little earlier. ;-)
> 
> Yes, I know. My gripe is that this will reduce our chances of successfully
> capturing the error_state, because now we expect to hit that case in the
> error capture code whereas up to now it would have been a bug somewhere.
> So either
> - fixup the error_capture to fall back to cpu reads (needs the usual
>   clflush dance if the object is not llc cached)
> - or drop the pin mappable change in this patch.

Ah you forget, I volunteered you to do the error-state capture from a
workqueue so that we could add further complexity... 

We would then be able to allocate enough memory to capture auxiliary
buffers as well, etc.

In the meantime, the paths that hit this code are during warmup (before
any batches have been retired into the userspace bo cache), slow steady
state behaviour (when the caches are being reaped and repopulated), and
when thrashing the hardware. It also requires that the whole mappable
range had already been allocated.

Whilst not negligible, the risk imo is small and all will be solved with
the next generation i915_capture_error().
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 17:03     ` Daniel Vetter
  2011-11-29 17:15       ` Chris Wilson
@ 2011-11-29 17:17       ` Daniel Vetter
  2011-11-29 17:21         ` Daniel Vetter
  1 sibling, 1 reply; 14+ messages in thread
From: Daniel Vetter @ 2011-11-29 17:17 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 06:03:53PM +0100, Daniel Vetter wrote:
> On Tue, Nov 29, 2011 at 04:48:15PM +0000, Chris Wilson wrote:
> > On Tue, 29 Nov 2011 16:34:41 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> > > > We try to avoid writing the relocations through the uncached GTT, if the
> > > > buffer is currently in the CPU write domain and so will be flushed out to
> > > > main memory afterwards anyway. Also on SandyBridge we can safely write
> > > > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > > > In either of these caches, we therefore do not need to force the
> > > > reallocation of the buffer into the mappable region of the GTT, reducing
> > > > the aperture pressure.
> > > > 
> > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > 
> > > The error_state capture currently relies on us pinning buffers as mappable
> > > when they contain relocations (and userspace always submitting a
> > > batchbuffer containing relocations). You break that guarantee without
> > > fixing up the error capture code. Otherwise I like this.
> > 
> > I may have sent that patch a little earlier. ;-)
> 
> Yes, I know. My gripe is that this will reduce our chances of successfully
> capturing the error_state, because now we expect to hit that case in the
> error capture code whereas up to now it would have been a bug somewhere.
> So either
> - fixup the error_capture to fall back to cpu reads (needs the usual
>   clflush dance if the object is not llc cached)
> - or drop the pin mappable change in this patch.

After some irc discussion with Dave Airlie I think a simple wmb() to flush
the wc buffer might make more sense. I'll try to run that past testers and
gather results. Will take at least a week to get anything conclusive.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

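The wmb() idea above, applied to the GTT relocation write path as it
looked at the time (the placement of the barrier here is an assumption
for illustration, not the change that was eventually tested or merged):

	reloc_page = io_mapping_map_atomic_wc(dev_priv->mm.gtt_mapping,
					      reloc->offset & PAGE_MASK);
	reloc_entry = (uint32_t __iomem *)
		(reloc_page + (reloc->offset & ~PAGE_MASK));
	iowrite32(reloc->delta, reloc_entry);
	wmb();	/* drain the CPU's write-combining buffer */
	io_mapping_unmap_atomic(reloc_page);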

* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 17:17       ` Daniel Vetter
@ 2011-11-29 17:21         ` Daniel Vetter
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2011-11-29 17:21 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 06:17:21PM +0100, Daniel Vetter wrote:
> On Tue, Nov 29, 2011 at 06:03:53PM +0100, Daniel Vetter wrote:
> > On Tue, Nov 29, 2011 at 04:48:15PM +0000, Chris Wilson wrote:
> > > On Tue, 29 Nov 2011 16:34:41 +0100, Daniel Vetter <daniel@ffwll.ch> wrote:
> > > > On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> > > > > We try to avoid writing the relocations through the uncached GTT, if the
> > > > > buffer is currently in the CPU write domain and so will be flushed out to
> > > > > main memory afterwards anyway. Also on SandyBridge we can safely write
> > > > > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > > > > In either of these caches, we therefore do not need to force the
> > > > > reallocation of the buffer into the mappable region of the GTT, reducing
> > > > > the aperture pressure.
> > > > > 
> > > > > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > > > 
> > > > The error_state capture currently relies on us pinning buffers as mappable
> > > > when they contain relocations (and userspace always submitting a
> > > > batchbuffer containing relocations). You break that guarantee without
> > > > fixing up the error capture code. Otherwise I like this.
> > > 
> > > I may have sent that patch a little earlier. ;-)
> > 
> > Yes, I know. My gripe is that this will reduce our chances of successfully
> > capturing the error_state, because now we expect to hit that case in the
> > error capture code whereas up to now it would have been a bug somewhere.
> > So either
> > - fixup the error_capture to fall back to cpu reads (needs the usual
> >   clflush dance if the object is not llc cached)
> > - or drop the pin mappable change in this patch.
> 
> After some irc discussion with Dave Airlie I think a simple wmb() to flush
> the wc buffer might make more sense. I'll try to run that past testers and
> gather results. Will take at least a week to get anything conclusive.

Meh, replied to the wrong mail ...
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 15:12 [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU Chris Wilson
  2011-11-29 15:34 ` Daniel Vetter
@ 2011-11-29 19:22 ` Daniel Vetter
  2011-11-29 21:32 ` Ben Widawsky
  2 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2011-11-29 19:22 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> We try to avoid writing the relocations through the uncached GTT, if the
> buffer is currently in the CPU write domain and so will be flushed out to
> main memory afterwards anyway. Also on SandyBridge we can safely write
> to the pages in cacheable memory, so long as the buffer is LLC mapped.
> In either of these caches, we therefore do not need to force the
> reallocation of the buffer into the mappable region of the GTT, reducing
> the aperture pressure.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>

With the improved error_state capture code this is
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-11-29 15:12 [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU Chris Wilson
  2011-11-29 15:34 ` Daniel Vetter
  2011-11-29 19:22 ` Daniel Vetter
@ 2011-11-29 21:32 ` Ben Widawsky
  2 siblings, 0 replies; 14+ messages in thread
From: Ben Widawsky @ 2011-11-29 21:32 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Tue, Nov 29, 2011 at 03:12:40PM +0000, Chris Wilson wrote:
> We try to avoid writing the relocations through the uncached GTT, if the
> buffer is currently in the CPU write domain and so will be flushed out to
> main memory afterwards anyway. Also on SandyBridge we can safely write
> to the pages in cacheable memory, so long as the buffer is LLC mapped.
> In either of these caches, we therefore do not need to force the
> reallocation of the buffer into the mappable region of the GTT, reducing
> the aperture pressure.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2012-03-26  8:32 ` Chris Wilson
@ 2012-03-27 11:17   ` Daniel Vetter
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2012-03-27 11:17 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Daniel Vetter, Intel Graphics Development

On Mon, Mar 26, 2012 at 09:32:52AM +0100, Chris Wilson wrote:
> On Mon, 26 Mar 2012 10:10:27 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > From: Chris Wilson <chris@chris-wilson.co.uk>
> > 
> > We try to avoid writing the relocations through the uncached GTT, if the
> > buffer is currently in the CPU write domain and so will be flushed out to
> > main memory afterwards anyway. Also on SandyBridge we can safely write
> > to the pages in cacheable memory, so long as the buffer is LLC mapped.
> > In either of these caches, we therefore do not need to force the
> s/caches/cases/
> 
> > reallocation of the buffer into the mappable region of the GTT, reducing
> > the aperture pressure.

Applied to dinq with the spelling fixed. Thanks for the patch.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2012-03-26  8:10 Daniel Vetter
@ 2012-03-26  8:32 ` Chris Wilson
  2012-03-27 11:17   ` Daniel Vetter
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2012-03-26  8:32 UTC (permalink / raw)
  To: Daniel Vetter, Intel Graphics Development

On Mon, 26 Mar 2012 10:10:27 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> From: Chris Wilson <chris@chris-wilson.co.uk>
> 
> We try to avoid writing the relocations through the uncached GTT, if the
> buffer is currently in the CPU write domain and so will be flushed out to
> main memory afterwards anyway. Also on SandyBridge we can safely write
> to the pages in cacheable memory, so long as the buffer is LLC mapped.
> In either of these caches, we therefore do not need to force the
s/caches/cases/

> reallocation of the buffer into the mappable region of the GTT, reducing
> the aperture pressure.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
@ 2012-03-26  8:10 Daniel Vetter
  2012-03-26  8:32 ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Daniel Vetter @ 2012-03-26  8:10 UTC (permalink / raw)
  To: Intel Graphics Development

From: Chris Wilson <chris@chris-wilson.co.uk>

We try to avoid writing the relocations through the uncached GTT, if the
buffer is currently in the CPU write domain and so will be flushed out to
main memory afterwards anyway. Also on SandyBridge we can safely write
to the pages in cacheable memory, so long as the buffer is LLC mapped.
In either of these caches, we therefore do not need to force the
reallocation of the buffer into the mappable region of the GTT, reducing
the aperture pressure.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---

Now that the cs prefetch/guard page is sorted out, we can resurrect this. It
obviously only makes sense together with the pwrite rework, for otherwise it'd
still be nigh on impossible to have buffers with relocations in unmappable space
(because userspace uses pwrite to upload them).
-Daniel

---
 drivers/gpu/drm/i915/i915_drv.h            |    2 +
 drivers/gpu/drm/i915/i915_gem.c            |    4 +--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   36 +++++++++++++++++++--------
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9c75a7b..348f693 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1270,6 +1270,8 @@ int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 				  bool write);
 int __must_check
+i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
+int __must_check
 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0a16366..b704327 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -39,8 +39,6 @@
 static __must_check int i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
-static __must_check int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
-							  bool write);
 static __must_check int i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
 								  uint64_t offset,
 								  uint64_t size);
@@ -3091,7 +3089,7 @@ i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
  * This function returns when the move is complete, including waiting on
  * flushes to occur.
  */
-static int
+int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t old_write_domain, old_read_domains;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 1fa0131..eb85860 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -266,6 +266,12 @@ eb_destroy(struct eb_objects *eb)
 	kfree(eb);
 }
 
+static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+{
+	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -354,11 +360,19 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return ret;
 	}
 
+	/* We can't wait for rendering with pagefaults disabled */
+	if (obj->active && in_atomic())
+		return -EFAULT;
+
 	reloc->delta += target_offset;
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
+	if (use_cpu_reloc(obj)) {
 		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
+		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+		if (ret)
+			return ret;
+
 		vaddr = kmap_atomic(obj->pages[reloc->offset >> PAGE_SHIFT]);
 		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
 		kunmap_atomic(vaddr);
@@ -367,10 +381,6 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		uint32_t __iomem *reloc_entry;
 		void __iomem *reloc_page;
 
-		/* We can't wait for rendering with pagefaults disabled */
-		if (obj->active && in_atomic())
-			return -EFAULT;
-
 		ret = i915_gem_object_set_to_gtt_domain(obj, 1);
 		if (ret)
 			return ret;
@@ -493,6 +503,13 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
 #define  __EXEC_OBJECT_HAS_FENCE (1<<31)
 
 static int
+need_reloc_mappable(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
+	return entry->relocation_count && !use_cpu_reloc(obj);
+}
+
+static int
 pin_and_fence_object(struct drm_i915_gem_object *obj,
 		     struct intel_ring_buffer *ring)
 {
@@ -505,8 +522,7 @@ pin_and_fence_object(struct drm_i915_gem_object *obj,
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable =
-		entry->relocation_count ? true : need_fence;
+	need_mappable = need_fence || need_reloc_mappable(obj);
 
 	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable);
 	if (ret)
@@ -563,8 +579,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable =
-			entry->relocation_count ? true : need_fence;
+		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if (need_mappable)
 			list_move(&obj->exec_list, &ordered_objects);
@@ -604,8 +619,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 				has_fenced_gpu_access &&
 				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 				obj->tiling_mode != I915_TILING_NONE;
-			need_mappable =
-				entry->relocation_count ? true : need_fence;
+			need_mappable = need_fence || need_reloc_mappable(obj);
 
 			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
 			    (need_mappable && !obj->map_and_fenceable))
-- 
1.7.9


* Re: [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
  2011-12-04 12:26 ` Chris Wilson
@ 2011-12-05 10:41   ` Daniel Vetter
  0 siblings, 0 replies; 14+ messages in thread
From: Daniel Vetter @ 2011-12-05 10:41 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

On Sun, Dec 04, 2011 at 12:26:41PM +0000, Chris Wilson wrote:
> We try to avoid writing the relocations through the uncached GTT, if the
> buffer is currently in the CPU write domain and so will be flushed out to
> main memory afterwards anyway. Also on SandyBridge we can safely write
> to the pages in cacheable memory, so long as the buffer is LLC mapped.
> In either of these caches, we therefore do not need to force the
> reallocation of the buffer into the mappable region of the GTT, reducing
> the aperture pressure.
>
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Daniel Vetter <daniel@ffwll.ch>
> ---
>
> v2: We need to wait on LLC-mapped buffers before performing the relocs.
> This is neatly done by making the GTT/CPU paths symmetric.
>
> Should fix i-g-t/tests/gem_reloc_vs_gpu when the test case itself is
> debugged ;-)

Testcase is now fixed up and does indeed pass with this patch.

Reviewed-by: Daniel Vetter <daniel.vetter@ffwll.ch>
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU
       [not found] <id:1322579560-15548-1-git-send-email-chris@chris-wilson.co.uk>
@ 2011-12-04 12:26 ` Chris Wilson
  2011-12-05 10:41   ` Daniel Vetter
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2011-12-04 12:26 UTC (permalink / raw)
  To: intel-gfx

We try to avoid writing the relocations through the uncached GTT, if the
buffer is currently in the CPU write domain and so will be flushed out to
main memory afterwards anyway. Also on SandyBridge we can safely write
to the pages in cacheable memory, so long as the buffer is LLC mapped.
In either of these caches, we therefore do not need to force the
reallocation of the buffer into the mappable region of the GTT, reducing
the aperture pressure.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Daniel Vetter <daniel@ffwll.ch>
---

v2: We need to wait on LLC-mapped buffers before performing the relocs.
This is neatly done by making the GTT/CPU paths symmetric.

Should fix i-g-t/tests/gem_reloc_vs_gpu when the test case itself is
debugged ;-)

---

 drivers/gpu/drm/i915/i915_drv.h            |    2 +
 drivers/gpu/drm/i915/i915_gem.c            |    4 +--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |   36 +++++++++++++++++++--------
 3 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index bdbd6d8..1cafe32 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1226,6 +1226,8 @@ int __must_check
 i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj,
 				  bool write);
 int __must_check
+i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write);
+int __must_check
 i915_gem_object_pin_to_display_plane(struct drm_i915_gem_object *obj,
 				     u32 alignment,
 				     struct intel_ring_buffer *pipelined);
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 036bc58..7ada9d2 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -39,8 +39,6 @@
 static __must_check int i915_gem_object_flush_gpu_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *obj);
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
-static __must_check int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
-							  bool write);
 static __must_check int i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
 								  uint64_t offset,
 								  uint64_t size);
@@ -3102,7 +3100,7 @@ i915_gem_object_finish_gpu(struct drm_i915_gem_object *obj)
  * This function returns when the move is complete, including waiting on
  * flushes to occur.
  */
-static int
+int
 i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 {
 	uint32_t old_write_domain, old_read_domains;
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index c918124..57e5b78 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -265,6 +265,12 @@ eb_destroy(struct eb_objects *eb)
 	kfree(eb);
 }
 
+static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
+{
+	return (obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_level != I915_CACHE_NONE);
+}
+
 static int
 i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 				   struct eb_objects *eb,
@@ -350,11 +356,19 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		return ret;
 	}
 
+	/* We can't wait for rendering with pagefaults disabled */
+	if (obj->active && in_atomic())
+		return -EFAULT;
+
 	reloc->delta += target_offset;
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU) {
+	if (use_cpu_reloc(obj)) {
 		uint32_t page_offset = reloc->offset & ~PAGE_MASK;
 		char *vaddr;
 
+		ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+		if (ret)
+			return ret;
+
 		vaddr = kmap_atomic(obj->pages[reloc->offset >> PAGE_SHIFT]);
 		*(uint32_t *)(vaddr + page_offset) = reloc->delta;
 		kunmap_atomic(vaddr);
@@ -363,10 +377,6 @@ i915_gem_execbuffer_relocate_entry(struct drm_i915_gem_object *obj,
 		uint32_t __iomem *reloc_entry;
 		void __iomem *reloc_page;
 
-		/* We can't wait for rendering with pagefaults disabled */
-		if (obj->active && in_atomic())
-			return -EFAULT;
-
 		ret = i915_gem_object_set_to_gtt_domain(obj, 1);
 		if (ret)
 			return ret;
@@ -463,6 +473,13 @@ i915_gem_execbuffer_relocate(struct drm_device *dev,
 #define  __EXEC_OBJECT_HAS_FENCE (1<<31)
 
 static int
+need_reloc_mappable(struct drm_i915_gem_object *obj)
+{
+	struct drm_i915_gem_exec_object2 *entry = obj->exec_entry;
+	return entry->relocation_count && !use_cpu_reloc(obj);
+}
+
+static int
 pin_and_fence_object(struct drm_i915_gem_object *obj,
 		     struct intel_ring_buffer *ring)
 {
@@ -475,8 +492,7 @@ pin_and_fence_object(struct drm_i915_gem_object *obj,
 		has_fenced_gpu_access &&
 		entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 		obj->tiling_mode != I915_TILING_NONE;
-	need_mappable =
-		entry->relocation_count ? true : need_fence;
+	need_mappable = need_fence || need_reloc_mappable(obj);
 
 	ret = i915_gem_object_pin(obj, entry->alignment, need_mappable);
 	if (ret)
@@ -532,8 +548,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 			has_fenced_gpu_access &&
 			entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 			obj->tiling_mode != I915_TILING_NONE;
-		need_mappable =
-			entry->relocation_count ? true : need_fence;
+		need_mappable = need_fence || need_reloc_mappable(obj);
 
 		if (need_mappable)
 			list_move(&obj->exec_list, &ordered_objects);
@@ -573,8 +588,7 @@ i915_gem_execbuffer_reserve(struct intel_ring_buffer *ring,
 				has_fenced_gpu_access &&
 				entry->flags & EXEC_OBJECT_NEEDS_FENCE &&
 				obj->tiling_mode != I915_TILING_NONE;
-			need_mappable =
-				entry->relocation_count ? true : need_fence;
+			need_mappable = need_fence || need_reloc_mappable(obj);
 
 			if ((entry->alignment && obj->gtt_offset & (entry->alignment - 1)) ||
 			    (need_mappable && !obj->map_and_fenceable))
-- 
1.7.7.3


end of thread

Thread overview: 14+ messages
2011-11-29 15:12 [PATCH] drm/i915: Avoid using mappable space for relocation processing through the CPU Chris Wilson
2011-11-29 15:34 ` Daniel Vetter
2011-11-29 16:48   ` Chris Wilson
2011-11-29 17:03     ` Daniel Vetter
2011-11-29 17:15       ` Chris Wilson
2011-11-29 17:17       ` Daniel Vetter
2011-11-29 17:21         ` Daniel Vetter
2011-11-29 19:22 ` Daniel Vetter
2011-11-29 21:32 ` Ben Widawsky
     [not found] <id:1322579560-15548-1-git-send-email-chris@chris-wilson.co.uk>
2011-12-04 12:26 ` Chris Wilson
2011-12-05 10:41   ` Daniel Vetter
2012-03-26  8:10 Daniel Vetter
2012-03-26  8:32 ` Chris Wilson
2012-03-27 11:17   ` Daniel Vetter
