intel-gfx.lists.freedesktop.org archive mirror
 help / color / mirror / Atom feed
* reduced LLC caching series
@ 2011-03-29 23:59 Eric Anholt
  2011-03-29 23:59 ` [PATCH 1/6] drm/i915: Rename agp_type to cache_level Eric Anholt
                   ` (6 more replies)
  0 siblings, 7 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

Chris posted a big patch series incorporating the cleaned up LLC
caching changes, but there were a bunch of not-ready-yet patches in
there.  This is a reduced series that I think is ready, with some
fixes from me.  From the final commit:

    Improves full-screen openarena on my laptop 20.3% +/- 4.0% (n=3)
    Improves 800x600 nexuiz on my laptop 12.3% +/- 0.1% (n=3)

Full screen, highest settings OA is now just about at refresh rate on
it.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 1/6] drm/i915: Rename agp_type to cache_level
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-29 23:59 ` [PATCH 2/6] drm/i915: Mark the cursor and the overlay as being part of the display planes Eric Anholt
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

... to clarify just how we use it inside the driver. We still need to
translate through agp_type for interface into the fake AGP driver.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/i915/i915_debugfs.c     |   11 ++++++-----
 drivers/gpu/drm/i915/i915_drv.h         |   12 +++++++++---
 drivers/gpu/drm/i915/i915_gem.c         |    2 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c     |   22 ++++++++++++++++------
 drivers/gpu/drm/i915/i915_irq.c         |    2 +-
 drivers/gpu/drm/i915/intel_ringbuffer.c |    4 ++--
 6 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 87c8e29..993e379 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -106,11 +106,12 @@ static const char *get_tiling_flag(struct drm_i915_gem_object *obj)
     }
 }
 
-static const char *agp_type_str(int type)
+static const char *cache_level_str(int type)
 {
 	switch (type) {
-	case 0: return " uncached";
-	case 1: return " snooped";
+	case I915_CACHE_NONE: return " uncached";
+	case I915_CACHE_LLC: return " snooped (LLC)";
+	case I915_CACHE_LLC_MLC: return " snooped (LLC+MLC)";
 	default: return "";
 	}
 }
@@ -127,7 +128,7 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 		   obj->base.write_domain,
 		   obj->last_rendering_seqno,
 		   obj->last_fenced_seqno,
-		   agp_type_str(obj->agp_type == AGP_USER_CACHED_MEMORY),
+		   cache_level_str(obj->cache_level),
 		   obj->dirty ? " dirty" : "",
 		   obj->madv == I915_MADV_DONTNEED ? " purgeable" : "");
 	if (obj->base.name)
@@ -714,7 +715,7 @@ static void print_error_buffers(struct seq_file *m,
 			   dirty_flag(err->dirty),
 			   purgeable_flag(err->purgeable),
 			   ring_str(err->ring),
-			   agp_type_str(err->agp_type));
+			   cache_level_str(err->cache_level));
 
 		if (err->name)
 			seq_printf(m, " (name: %d)", err->name);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 5004724..6d98e0e 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -188,7 +188,7 @@ struct drm_i915_error_state {
 		u32 dirty:1;
 		u32 purgeable:1;
 		u32 ring:4;
-		u32 agp_type:1;
+		u32 cache_level:2;
 	} *active_bo, *pinned_bo;
 	u32 active_bo_count, pinned_bo_count;
 	struct intel_overlay_error_state *overlay;
@@ -705,6 +705,12 @@ typedef struct drm_i915_private {
 	struct drm_property *broadcast_rgb_property;
 } drm_i915_private_t;
 
+enum i915_cache_level {
+	I915_CACHE_NONE,
+	I915_CACHE_LLC,
+	I915_CACHE_LLC_MLC, /* gen6+ */
+};
+
 struct drm_i915_gem_object {
 	struct drm_gem_object base;
 
@@ -791,6 +797,8 @@ struct drm_i915_gem_object {
 	unsigned int pending_fenced_gpu_access:1;
 	unsigned int fenced_gpu_access:1;
 
+	unsigned int cache_level:2;
+
 	struct page **pages;
 
 	/**
@@ -827,8 +835,6 @@ struct drm_i915_gem_object {
 	/** Record of address bit 17 of each page at last unbind. */
 	unsigned long *bit_17;
 
-	/** AGP mapping type (AGP_USER_MEMORY or AGP_USER_CACHED_MEMORY */
-	uint32_t agp_type;
 
 	/**
 	 * If present, while GEM_DOMAIN_CPU is in the read domain this array
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 7ce3f35..264bec8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3569,7 +3569,7 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
 	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 
-	obj->agp_type = AGP_USER_MEMORY;
+	obj->cache_level = I915_CACHE_NONE;
 	obj->base.driver_private = NULL;
 	obj->fence_reg = I915_FENCE_REG_NONE;
 	INIT_LIST_HEAD(&obj->mm_list);
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index b0abdc6..af3c0e6 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -29,6 +29,17 @@
 #include "i915_trace.h"
 #include "intel_drv.h"
 
+/* XXX kill agp_type! */
+static uint32_t cache_level_to_agp_type(enum i915_cache_level cache_level)
+{
+	switch (cache_level) {
+	default:
+	case I915_CACHE_NONE: return AGP_USER_MEMORY;
+	case I915_CACHE_LLC: return AGP_USER_CACHED_MEMORY;
+	case I915_CACHE_LLC_MLC: return AGP_USER_CACHED_MEMORY_LLC_MLC;
+	}
+}
+
 void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 {
 	struct drm_i915_private *dev_priv = dev->dev_private;
@@ -46,15 +57,14 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 
 			intel_gtt_insert_sg_entries(obj->sg_list,
 						    obj->num_sg,
-						    obj->gtt_space->start
-							>> PAGE_SHIFT,
-						    obj->agp_type);
+						    obj->gtt_space->start >> PAGE_SHIFT,
+						    cache_level_to_agp_type(obj->cache_level));
 		} else
 			intel_gtt_insert_pages(obj->gtt_space->start
 						   >> PAGE_SHIFT,
 					       obj->base.size >> PAGE_SHIFT,
 					       obj->pages,
-					       obj->agp_type);
+					       cache_level_to_agp_type(obj->cache_level));
 	}
 
 	intel_gtt_chipset_flush();
@@ -77,12 +87,12 @@ int i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj)
 		intel_gtt_insert_sg_entries(obj->sg_list,
 					    obj->num_sg,
 					    obj->gtt_space->start >> PAGE_SHIFT,
-					    obj->agp_type);
+					    cache_level_to_agp_type(obj->cache_level));
 	} else
 		intel_gtt_insert_pages(obj->gtt_space->start >> PAGE_SHIFT,
 				       obj->base.size >> PAGE_SHIFT,
 				       obj->pages,
-				       obj->agp_type);
+				       cache_level_to_agp_type(obj->cache_level));
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 188b497..5c0466e 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -676,7 +676,7 @@ static u32 capture_bo_list(struct drm_i915_error_buffer *err,
 		err->dirty = obj->dirty;
 		err->purgeable = obj->madv != I915_MADV_WILLNEED;
 		err->ring = obj->ring ? obj->ring->id : 0;
-		err->agp_type = obj->agp_type == AGP_USER_CACHED_MEMORY;
+		err->cache_level = obj->cache_level;
 
 		if (++i == count)
 			break;
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index e9e6f71..bb18dfb 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -236,7 +236,7 @@ init_pipe_control(struct intel_ring_buffer *ring)
 		ret = -ENOMEM;
 		goto err;
 	}
-	obj->agp_type = AGP_USER_CACHED_MEMORY;
+	obj->cache_level = I915_CACHE_LLC;
 
 	ret = i915_gem_object_pin(obj, 4096, true);
 	if (ret)
@@ -759,7 +759,7 @@ static int init_status_page(struct intel_ring_buffer *ring)
 		ret = -ENOMEM;
 		goto err;
 	}
-	obj->agp_type = AGP_USER_CACHED_MEMORY;
+	obj->cache_level = I915_CACHE_LLC;
 
 	ret = i915_gem_object_pin(obj, 4096, true);
 	if (ret != 0) {
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/6] drm/i915: Mark the cursor and the overlay as being part of the display planes
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
  2011-03-29 23:59 ` [PATCH 1/6] drm/i915: Rename agp_type to cache_level Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-29 23:59 ` [PATCH 3/6] drm/i915: Do not clflush snooped objects Eric Anholt
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/i915/intel_display.c |    2 +-
 drivers/gpu/drm/i915/intel_overlay.c |    2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/i915/intel_display.c b/drivers/gpu/drm/i915/intel_display.c
index 432fc04..8fdeae6 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -5355,7 +5355,7 @@ static int intel_crtc_cursor_set(struct drm_crtc *crtc,
 			goto fail_locked;
 		}
 
-		ret = i915_gem_object_set_to_gtt_domain(obj, 0);
+		ret = i915_gem_object_set_to_display_plane(obj, NULL);
 		if (ret) {
 			DRM_ERROR("failed to move cursor bo into the GTT\n");
 			goto fail_unpin;
diff --git a/drivers/gpu/drm/i915/intel_overlay.c b/drivers/gpu/drm/i915/intel_overlay.c
index a670c00..e0903c5 100644
--- a/drivers/gpu/drm/i915/intel_overlay.c
+++ b/drivers/gpu/drm/i915/intel_overlay.c
@@ -777,7 +777,7 @@ static int intel_overlay_do_put_image(struct intel_overlay *overlay,
 	if (ret != 0)
 		return ret;
 
-	ret = i915_gem_object_set_to_gtt_domain(new_bo, 0);
+	ret = i915_gem_object_set_to_display_plane(new_bo, NULL);
 	if (ret != 0)
 		goto out_unpin;
 
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/6] drm/i915: Do not clflush snooped objects
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
  2011-03-29 23:59 ` [PATCH 1/6] drm/i915: Rename agp_type to cache_level Eric Anholt
  2011-03-29 23:59 ` [PATCH 2/6] drm/i915: Mark the cursor and the overlay as being part of the display planes Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-29 23:59 ` [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level Eric Anholt
                   ` (3 subsequent siblings)
  6 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

Rely on the GPU snooping into the CPU cache for appropriately bound
objects on MI_FLUSH. Or perhaps one day we will have a cache-coherent
CPU/GPU package...

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/i915/i915_gem.c |    8 ++++++++
 1 files changed, 8 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 264bec8..fa483d8 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2878,6 +2878,14 @@ i915_gem_clflush_object(struct drm_i915_gem_object *obj)
 	if (obj->pages == NULL)
 		return;
 
+	/* If the GPU is snooping the contents of the CPU cache,
+	 * we do not need to clear the CPU cache lines. Instead we need
+	 * to be sure to flush/invalidate the RENDER cache when the contents
+	 * must be refreshed.
+	 */
+	if (obj->cache_level != I915_CACHE_NONE)
+		return;
+
 	trace_i915_gem_object_clflush(obj);
 
 	drm_clflush_pages(obj->pages, obj->base.size / PAGE_SIZE);
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
                   ` (2 preceding siblings ...)
  2011-03-29 23:59 ` [PATCH 3/6] drm/i915: Do not clflush snooped objects Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-30  7:09   ` Chris Wilson
  2011-03-29 23:59 ` [PATCH 5/6] drm/i915: Use the uncached domain for the display planes v2 Eric Anholt
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

From: Chris Wilson <chris@chris-wilson.co.uk>

[v2: Don't forget that when going from cached to uncached, we haven't
been tracking the write domain from the CPU perspective, since we
haven't needed it for GPU coherency.]

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/i915/i915_drv.h         |    6 ++++-
 drivers/gpu/drm/i915/i915_gem.c         |   35 ++++++++++++++++++++++++++++++-
 drivers/gpu/drm/i915/i915_gem_gtt.c     |    8 ++++--
 drivers/gpu/drm/i915/intel_ringbuffer.c |    6 +++-
 4 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 6d98e0e..e38765d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1190,9 +1190,13 @@ void i915_gem_release(struct drm_device *dev, struct drm_file *file);
 uint32_t
 i915_gem_get_unfenced_gtt_alignment(struct drm_i915_gem_object *obj);
 
+int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
+				    enum i915_cache_level cache_level);
+
 /* i915_gem_gtt.c */
 void i915_gem_restore_gtt_mappings(struct drm_device *dev);
-int __must_check i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj);
+int __must_check i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
+					  enum i915_cache_level cache_level);
 void i915_gem_gtt_unbind_object(struct drm_i915_gem_object *obj);
 
 /* i915_gem_evict.c */
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index fa483d8..8389b03 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2831,7 +2831,7 @@ i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
 		return ret;
 	}
 
-	ret = i915_gem_gtt_bind_object(obj);
+	ret = i915_gem_gtt_bind_object(obj, obj->cache_level);
 	if (ret) {
 		i915_gem_object_put_pages_gtt(obj);
 		drm_mm_put_block(obj->gtt_space);
@@ -3002,6 +3002,39 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
 	return 0;
 }
 
+int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
+				    enum i915_cache_level cache_level)
+{
+	int ret;
+
+	if (obj->cache_level == cache_level)
+		return 0;
+
+	if (obj->gtt_space) {
+		ret = i915_gem_object_flush_gpu(obj);
+		if (ret)
+			return ret;
+
+		ret = i915_gem_gtt_bind_object(obj, cache_level);
+		if (ret)
+			return ret;
+	}
+
+	if (cache_level == I915_CACHE_NONE) {
+		/* If we're coming frm LLC cached, then we haven't
+		 * actually been tracking whether the data is in the
+		 * CPU cache or not, since we only allow one bit set
+		 * in obj->write_domain.  Just set it to the CPU cache
+		 * for now.
+		 */
+		BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
+		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	}
+
+	obj->cache_level = cache_level;
+	return 0;
+}
+
 /*
  * Prepare buffer for display plane. Use uninterruptible for possible flush
  * wait, as in modesetting process we're not supposed to be interrupted.
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c
index af3c0e6..47d8bad 100644
--- a/drivers/gpu/drm/i915/i915_gem_gtt.c
+++ b/drivers/gpu/drm/i915/i915_gem_gtt.c
@@ -70,10 +70,12 @@ void i915_gem_restore_gtt_mappings(struct drm_device *dev)
 	intel_gtt_chipset_flush();
 }
 
-int i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj)
+int i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj,
+			     enum i915_cache_level cache_level)
 {
 	struct drm_device *dev = obj->base.dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
+	uint32_t agp_type = cache_level_to_agp_type(cache_level);
 	int ret;
 
 	if (dev_priv->mm.gtt->needs_dmar) {
@@ -87,12 +89,12 @@ int i915_gem_gtt_bind_object(struct drm_i915_gem_object *obj)
 		intel_gtt_insert_sg_entries(obj->sg_list,
 					    obj->num_sg,
 					    obj->gtt_space->start >> PAGE_SHIFT,
-					    cache_level_to_agp_type(obj->cache_level));
+					    agp_type);
 	} else
 		intel_gtt_insert_pages(obj->gtt_space->start >> PAGE_SHIFT,
 				       obj->base.size >> PAGE_SHIFT,
 				       obj->pages,
-				       cache_level_to_agp_type(obj->cache_level));
+				       agp_type);
 
 	return 0;
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c
index bb18dfb..6d25e79 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -236,7 +236,8 @@ init_pipe_control(struct intel_ring_buffer *ring)
 		ret = -ENOMEM;
 		goto err;
 	}
-	obj->cache_level = I915_CACHE_LLC;
+
+	i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
 
 	ret = i915_gem_object_pin(obj, 4096, true);
 	if (ret)
@@ -759,7 +760,8 @@ static int init_status_page(struct intel_ring_buffer *ring)
 		ret = -ENOMEM;
 		goto err;
 	}
-	obj->cache_level = I915_CACHE_LLC;
+
+	i915_gem_object_set_cache_level(obj, I915_CACHE_LLC);
 
 	ret = i915_gem_object_pin(obj, 4096, true);
 	if (ret != 0) {
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 5/6] drm/i915: Use the uncached domain for the display planes v2
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
                   ` (3 preceding siblings ...)
  2011-03-29 23:59 ` [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-29 23:59 ` [PATCH 6/6] drm/i915: Use the LLC mode on gen6 for everything but display Eric Anholt
  2011-03-30 22:35 ` reduced LLC caching series Michael Larabel
  6 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

The simplest and common method for ensuring scanout coherency on all
chipsets is to mark the scanout buffers as uncached (and for
userspace to remember to flush the render cache every so often).

We can improve upon this for later generations by marking scanout
objects as GFDT and only flush those cachelines when required. However,
we start simple.

[v2: Move the set to uncached above the clflush.  Otherwise, we'd skip
the clflush and try to scan out data that was still sitting in the
cache.]

Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c |   13 +++++++++++++
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 8389b03..de9a446 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3062,6 +3062,19 @@ i915_gem_object_set_to_display_plane(struct drm_i915_gem_object *obj,
 			return ret;
 	}
 
+	/* The display engine is not coherent with the LLC cache on gen6.  As
+	 * a result, we make sure that the pinning that is about to occur is
+	 * done with uncached PTEs. This is lowest common denominator for all
+	 * chipsets.
+	 *
+	 * However for gen6+, we could do better by using the GFDT bit instead
+	 * of uncaching, which would allow us to flush all the LLC-cached data
+	 * with that bit in the PTE to main memory with just one PIPE_CONTROL.
+	 */
+	ret = i915_gem_object_set_cache_level(obj, I915_CACHE_NONE);
+	if (ret)
+		return ret;
+
 	i915_gem_object_flush_cpu_write_domain(obj);
 
 	old_read_domains = obj->base.read_domains;
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 6/6] drm/i915: Use the LLC mode on gen6 for everything but display.
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
                   ` (4 preceding siblings ...)
  2011-03-29 23:59 ` [PATCH 5/6] drm/i915: Use the uncached domain for the display planes v2 Eric Anholt
@ 2011-03-29 23:59 ` Eric Anholt
  2011-03-30 22:35 ` reduced LLC caching series Michael Larabel
  6 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-29 23:59 UTC (permalink / raw)
  To: intel-gfx

Improves full-screen openarena on my laptop 20.3% +/- 4.0% (n=3)
Improves 800x600 nexuiz on my laptop 12.3% +/- 0.1% (n=3)

We have more room to improve with doing LLC caching for display using
GFDT, and in doing LLC+MLC caching, but this was an easy performance
win and incremental improvement toward those two.

Signed-off-by: Eric Anholt <eric@anholt.net>
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 drivers/gpu/drm/i915/i915_gem.c |   18 +++++++++++++++++-
 1 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index de9a446..87d4e17 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3623,7 +3623,23 @@ struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
 	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
 
-	obj->cache_level = I915_CACHE_NONE;
+	if (IS_GEN6(dev)) {
+		/* On Gen6, we can have the GPU use the LLC (the CPU
+		 * cache) for about a 10% performance improvement
+		 * compared to uncached.  Graphics requests other than
+		 * display scanout are coherent with the CPU in
+		 * accessing this cache.  This means in this mode we
+		 * don't need to clflush on the CPU side, and on the
+		 * GPU side we only need to flush internal caches to
+		 * get data visible to the CPU.
+		 *
+		 * However, we maintain the display planes as UC, and so
+		 * need to rebind when first used as such.
+		 */
+		obj->cache_level = I915_CACHE_LLC;
+	} else
+		obj->cache_level = I915_CACHE_NONE;
+
 	obj->base.driver_private = NULL;
 	obj->fence_reg = I915_FENCE_REG_NONE;
 	INIT_LIST_HEAD(&obj->mm_list);
-- 
1.7.4.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-29 23:59 ` [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level Eric Anholt
@ 2011-03-30  7:09   ` Chris Wilson
  2011-03-30 16:59     ` Eric Anholt
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2011-03-30  7:09 UTC (permalink / raw)
  To: Eric Anholt, intel-gfx

The series looks really good, only one quibble below.

On Tue, 29 Mar 2011 16:59:53 -0700, Eric Anholt <eric@anholt.net> wrote:
> +int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> +				    enum i915_cache_level cache_level)
> +	if (cache_level == I915_CACHE_NONE) {
> +		/* If we're coming frm LLC cached, then we haven't
> +		 * actually been tracking whether the data is in the
> +		 * CPU cache or not, since we only allow one bit set
> +		 * in obj->write_domain.  Just set it to the CPU cache
> +		 * for now.
> +		 */
> +		BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
> +		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	}

[We can rearrange the code to convert the BUG_ON into a
 if (WARN_ON()) return -EBUSY;.]

I think this is overkill, at least by my interpretation of the old BLT
docs which imply that cache line invalidation (both CPU and GPU) is done
for snooped PTEs on MI_FLUSH.

LLC -> UC transitions will be rare, only for new buffers (and on the first
page flip) after a modeset, but it's the principle! ;-)

Until we expose cache control through an ioctl we will never be able to
experiment...
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-30  7:09   ` Chris Wilson
@ 2011-03-30 16:59     ` Eric Anholt
  2011-03-30 17:16       ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Anholt @ 2011-03-30 16:59 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 1265 bytes --]

On Wed, 30 Mar 2011 08:09:47 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> The series looks really good, only one quibble below.
> 
> On Tue, 29 Mar 2011 16:59:53 -0700, Eric Anholt <eric@anholt.net> wrote:
> > +int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> > +				    enum i915_cache_level cache_level)
> > +	if (cache_level == I915_CACHE_NONE) {
> > +		/* If we're coming frm LLC cached, then we haven't
> > +		 * actually been tracking whether the data is in the
> > +		 * CPU cache or not, since we only allow one bit set
> > +		 * in obj->write_domain.  Just set it to the CPU cache
> > +		 * for now.
> > +		 */
> > +		BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
> > +		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > +	}
> 
> [We can rearrange the code to convert the BUG_ON into a
>  if (WARN_ON()) return -EBUSY;.]
> 
> I think this is overkill, at least by my interpretation of the old BLT
> docs which imply that cache line invalidation (both CPU and GPU) is done
> for snooped PTEs on MI_FLUSH.

And what about a CPU write through the GTT?  Or the CPU writes to the
pages before we turned them into a BO and cleared their CPU write domain
flag without actually clflushing them?

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-30 16:59     ` Eric Anholt
@ 2011-03-30 17:16       ` Chris Wilson
  2011-03-30 21:45         ` Eric Anholt
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2011-03-30 17:16 UTC (permalink / raw)
  To: Eric Anholt, intel-gfx

On Wed, 30 Mar 2011 09:59:55 -0700, Eric Anholt <eric@anholt.net> wrote:
> On Wed, 30 Mar 2011 08:09:47 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > The series looks really good, only one quibble below.
> > 
> > On Tue, 29 Mar 2011 16:59:53 -0700, Eric Anholt <eric@anholt.net> wrote:
> > > +int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> > > +				    enum i915_cache_level cache_level)
> > > +	if (cache_level == I915_CACHE_NONE) {
> > > +		/* If we're coming frm LLC cached, then we haven't
> > > +		 * actually been tracking whether the data is in the
> > > +		 * CPU cache or not, since we only allow one bit set
> > > +		 * in obj->write_domain.  Just set it to the CPU cache
> > > +		 * for now.
> > > +		 */
> > > +		BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
> > > +		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > > +	}
> > 
> > [We can rearrange the code to convert the BUG_ON into a
> >  if (WARN_ON()) return -EBUSY;.]
> > 
> > I think this is overkill, at least by my interpretation of the old BLT
> > docs which imply that cache line invalidation (both CPU and GPU) is done
> > for snooped PTEs on MI_FLUSH.
> 
> And what about a CPU write through the GTT?

Even on SNB these are still UC. And we should try hard not to, as the
specs give dire warnings about writing to snooped PTEs through the GTT.
(Since we will bypass the caches with the write, aiui, and cause
confusion.)

> Or the CPU writes to the
> pages before we turned them into a BO and cleared their CPU write domain
> flag without actually clflushing them?

That's a valid argument. Just frustrated by the necessity.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-30 17:16       ` Chris Wilson
@ 2011-03-30 21:45         ` Eric Anholt
  2011-03-31  7:29           ` Chris Wilson
  0 siblings, 1 reply; 14+ messages in thread
From: Eric Anholt @ 2011-03-30 21:45 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 2168 bytes --]

On Wed, 30 Mar 2011 18:16:11 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Wed, 30 Mar 2011 09:59:55 -0700, Eric Anholt <eric@anholt.net> wrote:
> > On Wed, 30 Mar 2011 08:09:47 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > The series looks really good, only one quibble below.
> > > 
> > > On Tue, 29 Mar 2011 16:59:53 -0700, Eric Anholt <eric@anholt.net> wrote:
> > > > +int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
> > > > +				    enum i915_cache_level cache_level)
> > > > +	if (cache_level == I915_CACHE_NONE) {
> > > > +		/* If we're coming frm LLC cached, then we haven't
> > > > +		 * actually been tracking whether the data is in the
> > > > +		 * CPU cache or not, since we only allow one bit set
> > > > +		 * in obj->write_domain.  Just set it to the CPU cache
> > > > +		 * for now.
> > > > +		 */
> > > > +		BUG_ON(obj->base.write_domain & I915_GEM_GPU_DOMAINS);
> > > > +		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> > > > +	}
> > > 
> > > [We can rearrange the code to convert the BUG_ON into a
> > >  if (WARN_ON()) return -EBUSY;.]
> > > 
> > > I think this is overkill, at least by my interpretation of the old BLT
> > > docs which imply that cache line invalidation (both CPU and GPU) is done
> > > for snooped PTEs on MI_FLUSH.
> > 
> > And what about a CPU write through the GTT?
> 
> Even on SNB these are still UC. And we should try hard not to, as the
> specs give dire warnings about writing to snooped PTEs through the GTT.
> (Since we will bypass the caches with the write, aiui, and cause
> confusion.)

Oh, wow.  That's really bad.  Reject this series if so.

If true, then we need to be accurately tracking the CPU write domain.
Mapping for GTT would need to actually clflush, and we'd need to clflush
to read GTT-written data.

On the other hand, I'm surprised things survived the testing I've done
if that's true, given that we're surely GTT pwriting the batchbuffer
data in, and then reading them through LLC.  I would expect to pull in a
bunch of zeroes in place of actual commands.  Were you perhaps referring
to pre-gen6 chipsets?

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: reduced LLC caching series
  2011-03-29 23:59 reduced LLC caching series Eric Anholt
                   ` (5 preceding siblings ...)
  2011-03-29 23:59 ` [PATCH 6/6] drm/i915: Use the LLC mode on gen6 for everything but display Eric Anholt
@ 2011-03-30 22:35 ` Michael Larabel
  6 siblings, 0 replies; 14+ messages in thread
From: Michael Larabel @ 2011-03-30 22:35 UTC (permalink / raw)
  To: intel-gfx

Eric & Chris,

For those interested, here's some of my test results of the LLC caching 
patch-set compared to a vanilla 2.6.38 and 2.6.39-rc1 on two SNBs. They 
just show some nice gains across the board and I haven't hit any issues 
with the patches.

Core i3 2100:
http://openbenchmarking.org/result/1103303-GR-INTELLCCC18&obr_ab=1

Core i5 2500K:
http://openbenchmarking.org/result/1103303-GR-INTELLCCC27&obr_ab=1

-- Michael

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-30 21:45         ` Eric Anholt
@ 2011-03-31  7:29           ` Chris Wilson
  2011-03-31 20:10             ` Eric Anholt
  0 siblings, 1 reply; 14+ messages in thread
From: Chris Wilson @ 2011-03-31  7:29 UTC (permalink / raw)
  To: Eric Anholt, intel-gfx

On Wed, 30 Mar 2011 14:45:11 -0700, Eric Anholt <eric@anholt.net> wrote:
> On Wed, 30 Mar 2011 18:16:11 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > On Wed, 30 Mar 2011 09:59:55 -0700, Eric Anholt <eric@anholt.net> wrote:
> > > And what about a CPU write through the GTT?
> > 
> > Even on SNB these are still UC. And we should try hard not to, as the
> > specs give dire warnings about writing to snooped PTEs through the GTT.
> > (Since we will bypass the caches with the write, aiui, and cause
> > confusion.)
> 
> Oh, wow.  That's really bad.  Reject this series if so.

I plucked that tidbit out of the specs for the BLT engine, which has not
been rigorously updated since gen2... Though don't we also encounter a few
subtleties with movnta (__copy_from_user_nocache_nozero from pwrite) and
data in cachelines?

But it is something that I worry about given my desire to start mapping
user pages and using the BLT engine for C to UC transfers.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level
  2011-03-31  7:29           ` Chris Wilson
@ 2011-03-31 20:10             ` Eric Anholt
  0 siblings, 0 replies; 14+ messages in thread
From: Eric Anholt @ 2011-03-31 20:10 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


[-- Attachment #1.1: Type: text/plain, Size: 1343 bytes --]

On Thu, 31 Mar 2011 08:29:31 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> On Wed, 30 Mar 2011 14:45:11 -0700, Eric Anholt <eric@anholt.net> wrote:
> > On Wed, 30 Mar 2011 18:16:11 +0100, Chris Wilson <chris@chris-wilson.co.uk> wrote:
> > > On Wed, 30 Mar 2011 09:59:55 -0700, Eric Anholt <eric@anholt.net> wrote:
> > > > And what about a CPU write through the GTT?
> > > 
> > > Even on SNB these are still UC. And we should try hard not to, as the
> > > specs give dire warnings about writing to snooped PTEs through the GTT.
> > > (Since we will bypass the caches with the write, aiui, and cause
> > > confusion.)
> > 
> > Oh, wow.  That's really bad.  Reject this series if so.
> 
> I plucked that tidbit out of the specs for the BLT engine, which has not
> been rigorously updated since gen2... Though don't we also encounter a few
> subtleties with movnta (__copy_from_user_nocache_nozero from pwrite) and
> data in cachelines?

The only tricky correctness bit for those was how to flush the little
tiny movnt cache when you're done, which is "mfence".  As far as other
interesting notes about movnt, it doesn't mean that the destination is
not in cache after the instruction -- if it was already in cache, it
will likely be in cache afterwards.  I don't think that has any impact
on how we do our code.

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2011-03-31 20:10 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-03-29 23:59 reduced LLC caching series Eric Anholt
2011-03-29 23:59 ` [PATCH 1/6] drm/i915: Rename agp_type to cache_level Eric Anholt
2011-03-29 23:59 ` [PATCH 2/6] drm/i915: Mark the cursor and the overlay as being part of the display planes Eric Anholt
2011-03-29 23:59 ` [PATCH 3/6] drm/i915: Do not clflush snooped objects Eric Anholt
2011-03-29 23:59 ` [PATCH 4/6] drm/i915: Add an interface to dynamically change the cache level Eric Anholt
2011-03-30  7:09   ` Chris Wilson
2011-03-30 16:59     ` Eric Anholt
2011-03-30 17:16       ` Chris Wilson
2011-03-30 21:45         ` Eric Anholt
2011-03-31  7:29           ` Chris Wilson
2011-03-31 20:10             ` Eric Anholt
2011-03-29 23:59 ` [PATCH 5/6] drm/i915: Use the uncached domain for the display planes v2 Eric Anholt
2011-03-29 23:59 ` [PATCH 6/6] drm/i915: Use the LLC mode on gen6 for everything but display Eric Anholt
2011-03-30 22:35 ` reduced LLC caching series Michael Larabel

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).