* [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
@ 2011-09-20  4:25 Ben Widawsky
  2011-09-20  4:25 ` [PATCH 1/6] drm/i915: object CPU flush interface Ben Widawsky
                   ` (6 more replies)
  0 siblings, 7 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx

I'm going to keep this short...
Patch 5 is my test case.
On Gen6 I see slightly better performance. On Gen5 I see really big
improvements (like 3x) for non-GTT write-only maps over regular mmaps.
GTT mappings don't really show any improvements as a whole.

Better tests would be nice, but without more significant Mesa changes,
or better benchmarks, I'm not sure how to get those.  While I think
these patches are mostly complete, ideas for better testing are very
welcome. Also of course, general optimizations or pointing out my errors
would be nice.


* [PATCH 1/6] drm/i915: object CPU flush interface
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20  4:25 ` [PATCH 2/6] drm/i915: write only object tracking Ben Widawsky
                   ` (5 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

Add an interface to allow flushing. While clflush can be run by anyone,
there is the potential for chipset-specific flushing requirements. The
plan is to have libdrm clients always call flush, and to let libdrm
decide whether or not to actually use this IOCTL.
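
For illustration, the intended call sequence from a libdrm client would
be roughly this (a sketch only; clflush_range is an assumed helper, and
the actual libdrm wrapper around this IOCTL is added in patch 4):

	struct drm_i915_gem_flush flush = { .bo_handle = handle };

	/* flush the CPU caches for the range we dirtied ... */
	clflush_range(ptr, len);
	/* ... then ask the kernel for any chipset-specific flush */
	drmIoctl(fd, DRM_IOCTL_I915_GEM_FLUSH, &flush);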

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_dma.c |    1 +
 drivers/gpu/drm/i915/i915_drv.h |    2 ++
 drivers/gpu/drm/i915/i915_gem.c |    8 ++++++++
 include/drm/i915_drm.h          |    5 +++++
 4 files changed, 16 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 8a3942c..badb555 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -2292,6 +2292,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_GEM_MADVISE, i915_gem_madvise_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(I915_GEM_FLUSH, i915_gem_flush_ioctl, DRM_UNLOCKED),
 };
 
 int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7916bd9..cee396c 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1108,6 +1108,8 @@ int i915_gem_throttle_ioctl(struct drm_device *dev, void *data,
 			    struct drm_file *file_priv);
 int i915_gem_madvise_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
+int i915_gem_flush_ioctl(struct drm_device *dev, void *data,
+			 struct drm_file *file_priv);
 int i915_gem_entervt_ioctl(struct drm_device *dev, void *data,
 			   struct drm_file *file_priv);
 int i915_gem_leavevt_ioctl(struct drm_device *dev, void *data,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a546a71..d4de7f7 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3674,6 +3674,14 @@ unlock:
 	return ret;
 }
 
+int
+i915_gem_flush_ioctl(struct drm_device *dev, void *data,
+		       struct drm_file *file_priv)
+{
+	intel_gtt_chipset_flush();
+	return 0;
+}
+
 struct drm_i915_gem_object *i915_gem_alloc_object(struct drm_device *dev,
 						  size_t size)
 {
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index 28c0d11..14a633d 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_FLUSH	0x2a
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -239,6 +240,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
 #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image)
 #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_GEM_FLUSH	DRM_IOW  (DRM_COMMAND_BASE + DRM_I915_GEM_FLUSH, struct drm_i915_gem_flush)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -844,4 +846,7 @@ struct drm_intel_overlay_attrs {
 	__u32 gamma5;
 };
 
+struct drm_i915_gem_flush {
+	__u32 bo_handle;
+};
 #endif				/* _I915_DRM_H_ */
-- 
1.7.6.1


* [PATCH 2/6] drm/i915: write only object tracking
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
  2011-09-20  4:25 ` [PATCH 1/6] drm/i915: object CPU flush interface Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20  4:25 ` [PATCH 3/6] drm/i915: Support write only mappings Ben Widawsky
                   ` (4 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

Add the struct member to track write only objects, and display them in
our debugfs.
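
With illustrative numbers, the new summary line in the object info file
looks like:

	2 write-only mappable objects, 2097152 bytes

and the per-object flag string gains a 'w', e.g. " (fw mappable)".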

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_debugfs.c |   18 +++++++++++++-----
 drivers/gpu/drm/i915/i915_drv.h     |    2 ++
 2 files changed, 15 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
index 3c395a5..b0f1964 100644
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -138,12 +138,14 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
 	if (obj->gtt_space != NULL)
 		seq_printf(m, " (gtt offset: %08x, size: %08x)",
 			   obj->gtt_offset, (unsigned int)obj->gtt_space->size);
-	if (obj->pin_mappable || obj->fault_mappable) {
-		char s[3], *t = s;
+	if (obj->pin_mappable || obj->fault_mappable || obj->cpu_write_only) {
+		char s[4], *t = s;
 		if (obj->pin_mappable)
 			*t++ = 'p';
 		if (obj->fault_mappable)
 			*t++ = 'f';
+		if (obj->cpu_write_only)
+			*t++ = 'w';
 		*t = '\0';
 		seq_printf(m, " (%s mappable)", s);
 	}
@@ -224,8 +226,8 @@ static int i915_gem_object_info(struct seq_file *m, void* data)
 	struct drm_info_node *node = (struct drm_info_node *) m->private;
 	struct drm_device *dev = node->minor->dev;
 	struct drm_i915_private *dev_priv = dev->dev_private;
-	u32 count, mappable_count;
-	size_t size, mappable_size;
+	u32 count, mappable_count, womap_count;
+	size_t size, mappable_size, womap_size;
 	struct drm_i915_gem_object *obj;
 	int ret;
 
@@ -263,7 +265,7 @@ static int i915_gem_object_info(struct seq_file *m, void* data)
 	seq_printf(m, "  %u [%u] freed objects, %zu [%zu] bytes\n",
 		   count, mappable_count, size, mappable_size);
 
-	size = count = mappable_size = mappable_count = 0;
+	size = count = mappable_size = mappable_count = womap_count = womap_size = 0;
 	list_for_each_entry(obj, &dev_priv->mm.gtt_list, gtt_list) {
 		if (obj->fault_mappable) {
 			size += obj->gtt_space->size;
@@ -273,11 +275,17 @@ static int i915_gem_object_info(struct seq_file *m, void* data)
 			mappable_size += obj->gtt_space->size;
 			++mappable_count;
 		}
+		if (obj->cpu_write_only) {
+			womap_size += obj->gtt_space->size;
+			++womap_count;
+		}
 	}
 	seq_printf(m, "%u pinned mappable objects, %zu bytes\n",
 		   mappable_count, mappable_size);
 	seq_printf(m, "%u fault mappable objects, %zu bytes\n",
 		   count, size);
+	seq_printf(m, "%u write-only mappable objects, %zu bytes\n",
+		   womap_count, womap_size);
 
 	seq_printf(m, "%zu [%zu] gtt total\n",
 		   dev_priv->mm.gtt_total, dev_priv->mm.mappable_gtt_total);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index cee396c..c960763 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -883,6 +883,8 @@ struct drm_i915_gem_object {
 	 * reaches 0, dev_priv->pending_flip_queue will be woken up.
 	 */
 	atomic_t pending_flip;
+
+	int cpu_write_only;
 };
 
 #define to_intel_bo(x) container_of(x, struct drm_i915_gem_object, base)
-- 
1.7.6.1


* [PATCH 3/6] drm/i915: Support write only mappings
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
  2011-09-20  4:25 ` [PATCH 1/6] drm/i915: object CPU flush interface Ben Widawsky
  2011-09-20  4:25 ` [PATCH 2/6] drm/i915: write only object tracking Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20  5:29   ` Keith Packard
  2011-09-20  8:30   ` Chris Wilson
  2011-09-20  4:25 ` [PATCH 4/6] intel: write only map support Ben Widawsky
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

When doing a GTT mapping, the pages are not backed until taking a fault.
If we know that the object is write only, when the fault happens we do not need
to make the pages coherent with the CPU.  This allows for semi fast prefaults
to occur in libdrm at map time.

For the non-GTT case, there isn't much to do except make sure we handle things
properly when we move an object out of the GTT.

Finally, modify the set domain IOCTL to actually mark objects as write only.
A current limitation is that once an object is marked write only, it cannot be
made readable again.
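
For reference, marking a bo write-only from userspace is just the
existing set_domain IOCTL with an empty read domain set (sketch, error
handling omitted):

	struct drm_i915_gem_set_domain sd = {
		.handle = handle,
		.read_domains = 0,	/* new case: no read domain at all */
		.write_domain = I915_GEM_DOMAIN_CPU,
	};

	drmIoctl(fd, DRM_IOCTL_I915_GEM_SET_DOMAIN, &sd);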

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_gem.c |   32 ++++++++++++++++++++++++--------
 1 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d4de7f7..48ea4bc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -1047,6 +1047,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	struct drm_i915_gem_object *obj;
 	uint32_t read_domains = args->read_domains;
 	uint32_t write_domain = args->write_domain;
+	int write_only = 0;
 	int ret;
 
 	if (!(dev->driver->driver_features & DRIVER_GEM))
@@ -1059,11 +1060,15 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 	if (read_domains & I915_GEM_GPU_DOMAINS)
 		return -EINVAL;
 
-	/* Having something in the write domain implies it's in the read
-	 * domain, and only that read domain.  Enforce that in the request.
-	 */
-	if (write_domain != 0 && read_domains != write_domain)
+	if (write_domain != 0 && read_domains == 0) {
+		write_only = 1;
+	} else if (write_domain != 0 && read_domains != write_domain) {
+		/* Having something in the write domain implies it's in the read
+		 * domain, and only that read domain. Enforce that in the
+		 * request.
+		 */
 		return -EINVAL;
+	}
 
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
@@ -1084,6 +1089,13 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
 		 */
 		if (ret == -EINVAL)
 			ret = 0;
+	} else if (obj->cpu_write_only && !write_only) {
+		DRM_ERROR("Not yet supported\n");
+		ret = -EINVAL;
+		goto unlock;
+	} else if (write_only) {
+		atomic_set(&obj->cpu_write_only, 1);
+		obj->base.read_domains = 0;
 	} else {
 		ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
 	}
@@ -1217,9 +1229,13 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		if (ret)
 			goto unlock;
 
-		ret = i915_gem_object_set_to_gtt_domain(obj, write);
-		if (ret)
-			goto unlock;
+		if (obj->cpu_write_only) {
+			i915_gem_object_flush_cpu_write_domain(obj);
+		} else {
+			ret = i915_gem_object_set_to_gtt_domain(obj, write);
+			if (ret)
+				goto unlock;
+		}
 	}
 
 	if (obj->tiling_mode == I915_TILING_NONE)
@@ -1586,7 +1602,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
 		obj->dirty = 0;
 
 	for (i = 0; i < page_count; i++) {
-		if (obj->dirty)
+		if (obj->dirty || obj->cpu_write_only)
 			set_page_dirty(obj->pages[i]);
 
 		if (obj->madv == I915_MADV_WILLNEED)
-- 
1.7.6.1


* [PATCH 4/6] intel: write only map support
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
                   ` (2 preceding siblings ...)
  2011-09-20  4:25 ` [PATCH 3/6] drm/i915: Support write only mappings Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20  4:25 ` [PATCH 5/6] write-only mappings Ben Widawsky
                   ` (2 subsequent siblings)
  6 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

Support the kernel write-only interface. Also add an interface to ease
user flushing on non-LLC-coherent architectures.
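
The intended calling sequence for an upload-only buffer then looks
roughly like this (a sketch; data and size are whatever the caller is
uploading, and on gen6+ the flush degenerates to a no-op):

	drm_intel_gem_bo_map_wo(bo);
	memcpy(bo->virtual, data, size);
	drm_intel_bo_unmap(bo);

	/* make the CPU writes visible to the GPU on non-LLC parts */
	drm_intel_gem_bo_flush(bo, 0, size);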

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 include/drm/i915_drm.h   |    5 ++
 intel/intel_bufmgr.h     |    9 ++++
 intel/intel_bufmgr_gem.c |  105 ++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 115 insertions(+), 4 deletions(-)

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index adc2392..576d419 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -189,6 +189,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_FLUSH	0x2a
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -230,6 +231,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
 #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_IOCTL_I915_OVERLAY_ATTRS, struct drm_intel_overlay_put_image)
 #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_GEM_FLUSH	DRM_IOW  (DRM_COMMAND_BASE + DRM_I915_GEM_FLUSH, struct drm_i915_gem_flush)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -835,4 +837,7 @@ struct drm_intel_overlay_attrs {
 	__u32 gamma5;
 };
 
+struct drm_i915_gem_flush {
+	__u32 bo_handle;
+};
 #endif				/* _I915_DRM_H_ */
diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index 889ef46..c6fd907 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -81,6 +81,9 @@ struct _drm_intel_bo {
 	 * MM-specific handle for accessing object
 	 */
 	int handle;
+
+	/** Buffer is only writable from CPU */
+	int write_only;
 };
 
 #define BO_ALLOC_FOR_RENDER (1<<0)
@@ -145,6 +148,12 @@ drm_intel_bo *drm_intel_bo_gem_create_from_name(drm_intel_bufmgr *bufmgr,
 						unsigned int handle);
 void drm_intel_bufmgr_gem_enable_reuse(drm_intel_bufmgr *bufmgr);
 void drm_intel_bufmgr_gem_enable_fenced_relocs(drm_intel_bufmgr *bufmgr);
+int drm_intel_gem_bo_flush(drm_intel_bo *bo, uint32_t offset, size_t size);
+#define drm_intel_gem_bo_flush_obj(bo) drm_intel_gem_bo_flush(bo, 0, bo->size)
+int drm_intel_gem_bo_map_wo(drm_intel_bo *bo);
+int drm_intel_gem_bo_map_gtt_wo(drm_intel_bo *bo,
+				uint32_t prefault_offset,
+				int prefault_pages);
 int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo);
 int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
 void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 4f4de92..b9e8936 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -998,7 +998,7 @@ static void drm_intel_gem_bo_unreference(drm_intel_bo *bo)
 	}
 }
 
-static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+static int do_bo_map(drm_intel_bo *bo, int read_enable, int write_enable)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
 	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
@@ -1036,6 +1036,9 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 	    bo_gem->mem_virtual);
 	bo->virtual = bo_gem->mem_virtual;
 
+	if (read_enable == 0 && bo->write_only)
+		goto map_done;
+
 	set_domain.handle = bo_gem->gem_handle;
 	set_domain.read_domains = I915_GEM_DOMAIN_CPU;
 	if (write_enable)
@@ -1050,13 +1053,73 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 		    __FILE__, __LINE__, bo_gem->gem_handle,
 		    strerror(errno));
 	}
+	if (read_enable == 0)
+		bo->write_only = 1;
 
+map_done:
 	pthread_mutex_unlock(&bufmgr_gem->lock);
 
 	return 0;
 }
 
-int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+{
+	return do_bo_map(bo, 1, write_enable);
+}
+
+static int
+get_cacheline_size(void)
+{
+	int ret = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+	if (ret == -1)
+		ret = 64;
+
+	return ret;
+}
+
+int drm_intel_gem_bo_flush(drm_intel_bo *bo, uint32_t offset, size_t size)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	const int cacheline_size = get_cacheline_size();
+	int num_cachelines;
+	struct drm_i915_gem_flush args;
+	int i;
+
+	if (bufmgr_gem->gen >= 6)
+		return 0;
+
+	if (bo_gem->mem_virtual == NULL)
+		return -1;
+
+	/* round offset down to cacheline size */
+	offset &= ~(cacheline_size - 1);
+
+	num_cachelines = (size + cacheline_size - 1) / cacheline_size;
+	if (num_cachelines <= 0)
+		return 0;
+
+	if ((offset + size) > bo->size)
+		return -1;
+
+	for(i = 0; i < num_cachelines; i++) {
+		void *addr = ((uint8_t *)bo_gem->mem_virtual) + offset + (i * cacheline_size);
+		asm volatile("clflush (%0)" :: "r" (addr));
+	}
+
+	args.bo_handle = bo_gem->gem_handle;
+	return ioctl(bufmgr_gem->fd, DRM_IOCTL_I915_GEM_FLUSH, &args);
+}
+
+/**
+ * Any buffers mapped write only must use the flush interface
+ */
+int drm_intel_gem_bo_map_wo(drm_intel_bo *bo)
+{
+	return do_bo_map(bo, 0, 1);
+}
+
+static int do_bo_map_gtt(drm_intel_bo *bo, int read, int write)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
 	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
@@ -1112,8 +1175,8 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 
 	/* Now move it to the GTT domain so that the CPU caches are flushed */
 	set_domain.handle = bo_gem->gem_handle;
-	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
-	set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+	set_domain.read_domains = read;
+	set_domain.write_domain = write;
 	ret = drmIoctl(bufmgr_gem->fd,
 		       DRM_IOCTL_I915_GEM_SET_DOMAIN,
 		       &set_domain);
@@ -1128,6 +1191,40 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 	return 0;
 }
 
+int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+{
+	return do_bo_map_gtt(bo, I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
+}
+
+int drm_intel_gem_bo_map_gtt_wo(drm_intel_bo *bo,
+				uint32_t prefault_offset,
+				int prefault_pages)
+{
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	volatile uint8_t *faultin_start;
+	int ret, i;
+
+	prefault_offset &= ~(4096 - 1);
+
+	if (prefault_offset + (prefault_pages * 4096) > bo->size)
+		return -EINVAL;
+
+	ret = do_bo_map_gtt(bo, 0, I915_GEM_DOMAIN_GTT);
+	if (ret)
+		return ret;
+
+	if (prefault_pages <= 0)
+		return 0;
+
+	faultin_start = ((uint8_t *)bo_gem->gtt_virtual) + prefault_offset;
+	for (i = 0; i < prefault_pages * 4096; i += 4096) {
+		volatile uint8_t *bar = (uint8_t *)(faultin_start + i);
+		*bar;
+	}
+
+	return ret;
+}
+
 int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
-- 
1.7.6.1


* [PATCH 5/6] write-only mappings
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
                   ` (3 preceding siblings ...)
  2011-09-20  4:25 ` [PATCH 4/6] intel: write only map support Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20  4:25 ` [PATCH 6/6] intel: use write only maps for MapRangeBuffer Ben Widawsky
  2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
  6 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

This test probably won't get pushed. It's a very synthetic test to help me
benchmark these patches.

Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 tests/Makefile.am  |    1 +
 tests/gem_wo_map.c |  311 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 tests/test.sh      |   10 ++
 3 files changed, 322 insertions(+), 0 deletions(-)
 create mode 100644 tests/gem_wo_map.c
 create mode 100755 tests/test.sh

diff --git a/tests/Makefile.am b/tests/Makefile.am
index 80a9c94..65cf128 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -44,6 +44,7 @@ TESTS = getversion \
 	gen3_mixed_blits \
 	gem_storedw_loop \
 	gem_storedw_batches_loop \
+	gem_wo_map \
 	$(NULL)
 
 HANG = \
diff --git a/tests/gem_wo_map.c b/tests/gem_wo_map.c
new file mode 100644
index 0000000..5c18e20
--- /dev/null
+++ b/tests/gem_wo_map.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Ben Widawsky <ben@bwidawsk.net>
+ *
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <unistd.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <errno.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <pthread.h>
+#include "drm.h"
+#include "i915_drm.h"
+#include "drmtest.h"
+#include "intel_bufmgr.h"
+#include "intel_batchbuffer.h"
+#include "intel_gpu_tools.h"
+
+static drm_intel_bufmgr *bufmgr;
+struct intel_batchbuffer *batch;
+drm_intel_bo *src, *dst;
+int fd;
+pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
+int num_writes = 1;
+
+#define NUM_MAPS 128 * 1024
+#define NUM_BLITS 128 * 1024
+#define SURFACE_SIZE (1<<20)
+#define SURFACE_PAGES (SURFACE_SIZE >> 12)
+
+bool thread_ret;
+
+/* acts like a gpu memcpy */
+static void
+blit(void)
+{
+	uint32_t pitch = 4096;
+	uint32_t lines = SURFACE_SIZE / pitch;
+	uint32_t line_width = pitch;
+
+	BEGIN_BATCH(8);
+	OUT_BATCH((2<<29) | (0x43<<22) | (0x3<<20) | 4);
+	OUT_BATCH((0x3<<24) | (0xcc<<16) | pitch);
+	OUT_BATCH((lines<<16) | line_width);
+	OUT_RELOC(dst, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
+	OUT_BATCH(pitch);
+	OUT_RELOC(src, I915_GEM_DOMAIN_RENDER, 0, 0);
+	ADVANCE_BATCH();
+
+	intel_batchbuffer_flush(batch);
+}
+
+static inline int
+get_offset(void)
+{
+	return (rand() % SURFACE_PAGES) * 4096;
+}
+
+static void
+touch_page(char * const base)
+{
+	int i = 0;
+	for(i = 0; i < num_writes; i++) {
+		base[get_offset()] = 1;
+	}
+}
+
+static void *
+blits(void *arg)
+{
+	int i = 0;
+	for (i = 0; i < NUM_BLITS; i++) {
+		blit();
+		usleep(100);
+	}
+	pthread_exit(NULL);
+}
+
+static void *
+maps(void *arg)
+{
+	int i = 0;
+	for (i = 0; i < NUM_MAPS; i++) {
+		drm_intel_bo_map(src, 1);
+		touch_page(src->virtual);
+		drm_intel_bo_unmap(src);
+	}
+	pthread_exit(NULL);
+}
+
+static void *
+maps_gtt(void *arg)
+{
+	int i = 0;
+	for (i = 0; i < NUM_MAPS; i++) {
+		drm_intel_gem_bo_map_gtt(src);
+		touch_page(src->virtual);
+		drm_intel_bo_unmap(src);
+	}
+	pthread_exit(NULL);
+}
+
+static void *
+maps_wo(void *arg)
+{
+	int i = 0;
+	for (i = 0; i < NUM_MAPS; i++) {
+		drm_intel_gem_bo_map_wo(src);
+		touch_page(src->virtual);
+		drm_intel_bo_unmap(src);
+	}
+
+	drm_intel_gem_bo_flush_obj(src);
+
+	pthread_exit(NULL);
+}
+
+static void *
+maps_gtt_wo(void *arg)
+{
+	int i = 0;
+	for (i = 0; i < NUM_MAPS; i++) {
+		drm_intel_gem_bo_map_gtt_wo(src, 0, 0);
+		touch_page(src->virtual);
+		drm_intel_bo_unmap(src);
+	}
+	pthread_exit(&thread_ret);
+}
+
+static void *
+functional_test_wo(void *arg)
+{
+	drm_intel_gem_bo_map_wo(src);
+	touch_page(src->virtual);
+	((char *)src->virtual)[1000] = 1;
+	drm_intel_bo_unmap(src);
+	blit();
+	drm_intel_bo_wait_rendering(batch->bo);
+
+	drm_intel_bo_map(dst, 0);
+	thread_ret = (((char *)dst->virtual)[1000] == 1);
+	drm_intel_bo_unmap(dst);
+	pthread_exit(&thread_ret);
+}
+
+static void *
+functional_test_gtt_wo(void *arg)
+{
+	drm_intel_gem_bo_map_gtt_wo(src, 0, 0);
+	touch_page(src->virtual);
+	((char *)src->virtual)[1000] = 1;
+	drm_intel_bo_unmap(src);
+	blit();
+	drm_intel_bo_wait_rendering(batch->bo);
+
+	drm_intel_bo_map(dst, 0);
+	thread_ret = (((char *)dst->virtual)[1000] == 1);
+	drm_intel_bo_unmap(dst);
+	pthread_exit(&thread_ret);
+}
+
+static void *
+functional_test_gtt(void *arg)
+{
+	drm_intel_gem_bo_map_gtt(src);
+	((char *)src->virtual)[1000] = 1;
+	drm_intel_bo_unmap(src);
+	blit();
+	drm_intel_bo_wait_rendering(batch->bo);
+
+	drm_intel_bo_map(dst, 0);
+	thread_ret = (((char *)dst->virtual)[1000] == 1);
+	drm_intel_bo_unmap(dst);
+	pthread_exit(&thread_ret);
+}
+
+static void *
+functional_test(void *arg)
+{
+	drm_intel_bo_map(src, 1);
+	((char *)src->virtual)[1000] = 1;
+	drm_intel_bo_unmap(src);
+	blit();
+	drm_intel_bo_wait_rendering(batch->bo);
+
+	drm_intel_bo_map(dst, 0);
+	thread_ret = (((char *)dst->virtual)[1000] == 1);
+	drm_intel_bo_unmap(dst);
+	pthread_exit(&thread_ret);
+}
+
+typedef void * (*thread_function)(void *);
+
+thread_function ffuncs[2][2] = {
+	{functional_test, functional_test_gtt},
+	{functional_test_wo, functional_test_gtt_wo}
+};
+
+thread_function sfuncs[2][2] = {
+	{maps, maps_gtt},
+	{maps_wo, maps_gtt_wo}
+};
+
+static void usage(const char *name)
+{
+	fprintf(stderr, "Usage: %s -[s|f] [-c num_writes] [-w write only] [-g gtt]\n", name);
+}
+
+int main(int argc, char **argv)
+{
+	pthread_t blit_thread;
+	pthread_t map_thread;
+	int test = 0;
+	int write_only = 0;
+	int gtt = 0;
+	int opt;
+	void *thread_result;
+
+	while ((opt = getopt(argc, argv, "hsfwgc:")) != -1) {
+		switch(opt) {
+		case 'c':
+			num_writes = atoi(optarg);
+			break;
+		case 's':
+			test = 1;
+			break;
+		case 'f':
+			test = 2;
+			break;
+		case 'w':
+			write_only = 1;
+			break;
+		case 'g':
+			gtt = 1;
+			break;
+		case 'h':
+		case '?':
+		default:
+			usage(argv[0]);
+			exit(EXIT_FAILURE);
+		}
+	}
+
+	srandom(0xdeadbeef);
+	fd = drm_open_any();
+
+	bufmgr = drm_intel_bufmgr_gem_init(fd, 4096);
+	drm_intel_bufmgr_gem_enable_reuse(bufmgr);
+	batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd));
+
+	src = drm_intel_bo_alloc(bufmgr, "src", SURFACE_SIZE, 4096);
+	dst = drm_intel_bo_alloc(bufmgr, "dst", SURFACE_SIZE, 4096);
+
+	switch (test) {
+	case 0:
+		usage(argv[0]);
+		exit(EXIT_FAILURE);
+		break;
+	case 2:
+		pthread_create(&map_thread, NULL, ffuncs[gtt][write_only], NULL);
+		pthread_join(map_thread, &thread_result);
+		assert(*(bool *)thread_result == true);
+		break;
+	default:
+		pthread_create(&blit_thread, NULL, blits, NULL);
+		pthread_create(&map_thread, NULL, sfuncs[gtt][write_only], NULL);
+		pthread_join(map_thread, NULL);
+		/* We don't want to time blit performance for this benchmark,
+		 * though it is relevant.
+		 */
+		pthread_join(blit_thread, NULL);
+		break;
+	}
+
+	intel_batchbuffer_free(batch);
+	drm_intel_bufmgr_destroy(bufmgr);
+
+	close(fd);
+
+	return 0;
+}
diff --git a/tests/test.sh b/tests/test.sh
new file mode 100755
index 0000000..8a03249
--- /dev/null
+++ b/tests/test.sh
@@ -0,0 +1,10 @@
+#!/bin/sh
+
+echo "Map"
+time ./gem_wo_map -s -c 50
+echo "Map GTT"
+time ./gem_wo_map -sg -c 50
+echo "Map WO"
+time ./gem_wo_map -sw -c 50
+echo "Map GTT WO"
+time ./gem_wo_map -sgw -c 50
-- 
1.7.6.1


* [PATCH 6/6] intel: use write only maps for MapRangeBuffer
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
                   ` (4 preceding siblings ...)
  2011-09-20  4:25 ` [PATCH 5/6] write-only mappings Ben Widawsky
@ 2011-09-20  4:25 ` Ben Widawsky
  2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
  6 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  4:25 UTC (permalink / raw)
  To: intel-gfx; +Cc: mesa-dev, Ben Widawsky


Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 src/mesa/drivers/dri/intel/intel_buffer_objects.c |   14 ++++++++++----
 1 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index d35a50e..18bed4d 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -296,8 +296,6 @@ intel_bufferobj_get_subdata(struct gl_context * ctx,
    }
 }
 
-
-
 /**
  * Called via glMapBufferRange and glMapBuffer
  *
@@ -401,8 +399,16 @@ intel_bufferobj_map_range(struct gl_context * ctx,
    }
 
    if (!(access & GL_MAP_READ_BIT)) {
-      drm_intel_gem_bo_map_gtt(intel_obj->buffer);
-      intel_obj->mapped_gtt = GL_TRUE;
+      if (intel->gen >= 6) {
+	 drm_intel_gem_bo_map_wo(intel_obj->buffer);
+	 intel_obj->mapped_gtt = GL_FALSE;
+      } else if (access & GL_MAP_UNSYNCHRONIZED_BIT) {
+         drm_intel_gem_bo_map_gtt_wo(intel_obj->buffer, 0, 1);
+         intel_obj->mapped_gtt = GL_TRUE;
+      } else {
+         drm_intel_gem_bo_map_gtt(intel_obj->buffer);
+         intel_obj->mapped_gtt = GL_TRUE;
+      }
    } else {
       drm_intel_bo_map(intel_obj->buffer, (access & GL_MAP_WRITE_BIT) != 0);
       intel_obj->mapped_gtt = GL_FALSE;
-- 
1.7.6.1

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 3/6] drm/i915: Support write only mappings
  2011-09-20  4:25 ` [PATCH 3/6] drm/i915: Support write only mappings Ben Widawsky
@ 2011-09-20  5:29   ` Keith Packard
  2011-09-20  5:37     ` Ben Widawsky
  2011-09-20  8:30   ` Chris Wilson
  1 sibling, 1 reply; 28+ messages in thread
From: Keith Packard @ 2011-09-20  5:29 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky



On Mon, 19 Sep 2011 21:25:03 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> When doing a GTT mapping, the pages are not backed until taking a fault.
> If we know that the object is write only, when the fault happens we do not need
> to make the pages coherent with the CPU.  This allows for semi fast prefaults
> to occur in libdrm at map time.

Every time I see this stuff, I get confused by the term
'write-only'. Reading through the code, I remind myself that what you're
actually promising is that any CPU accesses to the object will not
overlap any potential writes by the GPU. The CPU could be reading or
writing, it really doesn't matter.

Really what you want is a kernel call that does everything except block
on the GPU, right?

-- 
keith.packard@intel.com


* Re: [PATCH 3/6] drm/i915: Support write only mappings
  2011-09-20  5:29   ` Keith Packard
@ 2011-09-20  5:37     ` Ben Widawsky
  0 siblings, 0 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20  5:37 UTC (permalink / raw)
  To: Keith Packard; +Cc: Daniel Vetter, intel-gfx



On Mon, Sep 19, 2011 at 10:29:15PM -0700, Keith Packard wrote:
> On Mon, 19 Sep 2011 21:25:03 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> > When doing a GTT mapping, the pages are not backed until taking a fault.
> > If we know that the object is write only, when the fault happens we do not need
> > to make the pages coherent with the CPU.  This allows for semi fast prefaults
> > to occur in libdrm at map time.
> 
> Every time I see this stuff, I get confused by the term
> 'write-only'. Reading through the code, I remind myself that what you're
> actually promising is that any CPU accesses to the object will not
> overlap any potential writes by the GPU. The CPU could be reading or
> writing, it really doesn't matter.

I think of it as, we (libdrm + kernel) promise that the buffer is not
guaranteed to be coherent with the GPU. Mesa/client is who should be
making the promise to not write to a part of the buffer that the GPU may
be accessing.
> 
> Really what you want is a kernel call that does everything except block
> on the GPU, right?

Yes, exactly.

Ben


* Re: [PATCH 3/6] drm/i915: Support write only mappings
  2011-09-20  4:25 ` [PATCH 3/6] drm/i915: Support write only mappings Ben Widawsky
  2011-09-20  5:29   ` Keith Packard
@ 2011-09-20  8:30   ` Chris Wilson
  1 sibling, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2011-09-20  8:30 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

On Mon, 19 Sep 2011 21:25:03 -0700, Ben Widawsky <ben@bwidawsk.net> wrote:
> When doing a GTT mapping, the pages are not backed until taking a fault.
> If we know that the object is write only, when the fault happens we do not need
> to make the pages coherent with the CPU.  This allows for semi fast prefaults
> to occur in libdrm at map time.
> 
> For the non-GTT case, there isn't much to do except make sure we handle things
> properly when we move an object out of the GTT.
> 
> Finally, modify the set domain IOCTL to actually mark objects as write only.
> A current limitation is that once an object is marked write only, it cannot be
> made readable again.

I couldn't see any reason for this restriction and it does impose a
significant burden on the userspace cache; any buffer used for
write-only mappings must be purged and can't be reused.

> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   32 ++++++++++++++++++++++++--------
>  1 files changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index d4de7f7..48ea4bc 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -1047,6 +1047,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  	struct drm_i915_gem_object *obj;
>  	uint32_t read_domains = args->read_domains;
>  	uint32_t write_domain = args->write_domain;
> +	int write_only = 0;
>  	int ret;
>  
>  	if (!(dev->driver->driver_features & DRIVER_GEM))
> @@ -1059,11 +1060,15 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  	if (read_domains & I915_GEM_GPU_DOMAINS)
>  		return -EINVAL;
>  
> -	/* Having something in the write domain implies it's in the read
> -	 * domain, and only that read domain.  Enforce that in the request.
> -	 */
> -	if (write_domain != 0 && read_domains != write_domain)
> +	if (write_domain != 0 && read_domains == 0) {

Neat extension of the existing ioctl. Just a little wary of the
potential for mass foot shootings - but this whole extension is about
giving them a machinegun. 

> +		write_only = 1;
> +	} else if (write_domain != 0 && read_domains != write_domain) {
> +		/* Having something in the write domain implies it's in the read
> +		 * domain, and only that read domain. Enforce that in the
> +		 * request.
> +		 */
>  		return -EINVAL;
> +	}
>  
>  	ret = i915_mutex_lock_interruptible(dev);
>  	if (ret)
> @@ -1084,6 +1089,13 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
>  		 */
>  		if (ret == -EINVAL)
>  			ret = 0;
> +	} else if (obj->cpu_write_only && !write_only) {
> +		DRM_ERROR("Not yet supported\n");
> +		ret = -EINVAL;
> +		goto unlock;
> +	} else if (write_only) {
> +		atomic_set(&obj->cpu_write_only, 1);

Atomic? Did you intend for obj->cpu_write_only to be atomic_t?

> +		obj->base.read_domains = 0;
>  	} else {
>  		ret = i915_gem_object_set_to_cpu_domain(obj, write_domain != 0);
>  	}
> @@ -1217,9 +1229,13 @@ int i915_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>  		if (ret)
>  			goto unlock;
>  
> -		ret = i915_gem_object_set_to_gtt_domain(obj, write);
> -		if (ret)
> -			goto unlock;
> +		if (obj->cpu_write_only) {
> +			i915_gem_object_flush_cpu_write_domain(obj);
> +		} else {
> +			ret = i915_gem_object_set_to_gtt_domain(obj, write);
> +			if (ret)
> +				goto unlock;
> +		}
>  	}
>  
>  	if (obj->tiling_mode == I915_TILING_NONE)
> @@ -1586,7 +1602,7 @@ i915_gem_object_put_pages_gtt(struct drm_i915_gem_object *obj)
>  		obj->dirty = 0;
>  
>  	for (i = 0; i < page_count; i++) {
> -		if (obj->dirty)
> +		if (obj->dirty || obj->cpu_write_only)

Could this be simplified by marking the bo as dirty upon setting the
object write-only?
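
Something like this (untested sketch) in set_domain_ioctl, which would
also let the put_pages_gtt hunk above go away:

	} else if (write_only) {
		obj->cpu_write_only = 1;
		obj->dirty = 1;	/* pages always written back on unbind */
		obj->base.read_domains = 0;
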
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
                   ` (5 preceding siblings ...)
  2011-09-20  4:25 ` [PATCH 6/6] intel: use write only maps for MapRangeBuffer Ben Widawsky
@ 2011-09-20 11:06 ` Daniel Vetter
  2011-09-20 17:17   ` Eric Anholt
                     ` (2 more replies)
  6 siblings, 3 replies; 28+ messages in thread
From: Daniel Vetter @ 2011-09-20 11:06 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Mon, Sep 19, 2011 at 09:25:00PM -0700, Ben Widawsky wrote:
> I'm going to keep this short...
> Patch 5 is my test case.
> On Gen6 I see slightly better performance. On Gen5 I see really big
> improvements (like 3x) for non-GTT write-only maps over regular mmaps.
> GTT mappings don't really show any improvements as a whole.
>
> Better tests would be nice, but without more significant Mesa changes,
> or better benchmarks, I'm not sure how to get those.  While I think
> these patches are mostly complete, ideas for better testing are very
> welcome. Also of course, general optimizations or pointing out my errors
> would be nice.

Ok, I'm gonna be the dense annoying bastard here:

- Can we stop calling these mappings write-only? Afaics the distinguishing
  feature is that they're non-blocking. And yes, current users only use
  non-blocking paths to upload data because the amount of data we're
  currently downloading is so small. Hence we can use one bo for each
  download without wasting too much space and still avoid unnecessary
  blocking. But I think this will change, e.g. with designs like sna that
  tightly integrate gpu and sw rendering. Or OpenCL.

- Why do we need any patches for gtt non-blocking mmaps? I've re-read our
  code, and afaics we're only calling wait_rendering from gem_fault if
  obj->gtt_space == NULL. I.e. there's no way the gpu is currently using
  the data and hence no way for us to block on it. I think the only thing
  needed is a small libdrm patch to enable non-blocking gtt mmaps

  void drm_intel_enable_non_blocking_gtt_mmap(obj)

  which sets a bit somewhere and moves the obj (once) into the gtt domain.
  And a corresponding change in gtt_mmap to disable the set_domain call.
  This only works as long as no one else access the object from the cpu
  domain, but afaics we'll use non-blocking mmaps only for unshared
  buffers, so that should be fine.

  I might also just be dense and not see the issue ...

- I'm sorry for having suggested to implement the clflush ioctl, I think it's
  a foolish idea now. Non-blocking mmaps are a performance optimization;
  needing to sync caches with clflush is very much the opposite. So I
  think we can dustbin this.

  Now non-blocking cpu mmaps make very much sense on llc/snooped buffer
  objects. So I think we actually need an ioctl to get obj->cache_level so
  userspace can decide whether it should use non-blocking gtt mmaps or
  (non-blocking) cpu mmaps. We might as well go full-circle, make Chris
  happy and merge the corresponding set_cache_level ioctl to enable
  snooped buffers on machines with ilk-like coherency (i.e. that atom
  thing I'm hearing about ...). But imo that's material for non-blocking
  mmaps, step 2.

Cheers, Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
@ 2011-09-20 17:17   ` Eric Anholt
  2011-09-20 19:19     ` Daniel Vetter
  2011-09-20 21:16   ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Chris Wilson
  2011-09-20 22:19   ` Ben Widawsky
  2 siblings, 1 reply; 28+ messages in thread
From: Eric Anholt @ 2011-09-20 17:17 UTC (permalink / raw)
  To: Daniel Vetter, Ben Widawsky; +Cc: intel-gfx



On Tue, 20 Sep 2011 13:06:43 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
> - Why do we need any patches for gtt non-blocking mmaps? I've re-read our
>   code, and afaics we're only calling wait_rendering from gem_fault if
>   obj->gtt_space == NULL. I.e. there's no way the gpu is currently using
>   the data and hence no way for us to block on it. I think the only thing
>   needed is a small libdrm patch to enable non-blocking gtt mmaps
> 
>   void drm_intel_enable_non_blocking_gtt_mmap(obj)
> 
>   which sets a bit somewhere and moves the obj (once) into the gtt domain.
>   And a corresponding change in gtt_mmap to disable the set_domain call.
>   This only works as long as no one else access the object from the cpu
>   domain, but afaics we'll use non-blocking mmaps only for unshared
>   buffers, so that should be fine.
> 
>   I might also just be dense and not see the issue ...

This was what I was looking for.  Ben was concerned that while warming
up towards steady state, the page faults for new pages of the giant
vertex buffer (for example) would end up blocking in the fault handler.
I really have a hard time caring about that case.


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 17:17   ` Eric Anholt
@ 2011-09-20 19:19     ` Daniel Vetter
  2011-09-21  8:19       ` [PATCH] intel: non-blocking mmaps on the cheap Daniel Vetter
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Vetter @ 2011-09-20 19:19 UTC (permalink / raw)
  To: Eric Anholt; +Cc: Ben Widawsky, intel-gfx

On Tue, Sep 20, 2011 at 10:17:25AM -0700, Eric Anholt wrote:
> On Tue, 20 Sep 2011 13:06:43 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
> > - Why do we need any patches for gtt non-blocking mmaps? I've re-read our
> >   code, and afaics we're only calling wait_rendering from gem_fault if
> >   obj->gtt_space == NULL. I.e. there's no way the gpu is currently using
> >   the data and hence no way for us to block on it. I think the only thing
> >   needed is a small libdrm patch to enable non-blocking gtt mmaps
> > 
> >   void drm_intel_enable_non_blocking_gtt_mmap(obj)
> > 
> >   which sets a bit somewhere and moves the obj (once) into the gtt domain.
> >   And a corresponding change in gtt_mmap to disable the set_domain call.
> >   This only works as long as no one else access the object from the cpu
> >   domain, but afaics we'll use non-blocking mmaps only for unshared
> >   buffers, so that should be fine.
> > 
> >   I might also just be dense and not see the issue ...
> 
> This was what I was looking for.  Ben was concerned that while warming
> up towards steady state, the page faults for new pages of the giant
> vertex buffer (for example) would end up blocking in the fault handler.
> I really have a hard time caring about that case.

Well, that can easily be handled by just prefaulting the full range on the
enable_non_blocking call. The thing I was concerned about was when we need
to move around the bo in the gtt to make some space and shoot down the
mappings to do so: On the first fault the bo is naturally not busy, but on
subsequent faults on other parts of the bo the gpu might already be using
it. But I've double-checked, and it looks like with your revert (commit
e92d03bf) we should be safe.

Now for cpu coherent mmaps on machines/kernels that support llc caching:
Would you prefer libdrm to transparently use that for non-blocking maps if
available, or is an explicit feature-check with a separate cpu map
function preferred? I'm thinking of adding a new map_non_blocking
function to libdrm that either uses gtt mmaps, or cpu mmaps if they're
coherent. The risk I'm seeing with that approach is that future hw gens
might have slightly different semantics for these (e.g. funny games with
swizzling), so transparently using one instead of the other may end up in
headaches. Otoh for untiled buffers used to upload vertices, pixels and
whatnot, we should be fairly safe. And cpu mmaps for tiled buffers are
broken already, thanks to bit17 swizzling.
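
Roughly what I have in mind (just a sketch; the two helpers are assumed
internal variants that mmap without issuing the blocking set_domain
ioctl, and the coherency bit would come from the new cache-level query):

	int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo)
	{
		drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;

		/* prefer coherent cpu mmaps where the hw allows it,
		 * otherwise fall back to wc gtt mmaps */
		if (bo_gem->gpu_coherent_cpu_mmap)
			return map_cpu_no_set_domain(bo);

		return map_gtt_no_set_domain(bo);
	}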

I think I can carve out a bit of time and whip up an rfc patchset in the
coming days.

Cheers, Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
  2011-09-20 17:17   ` Eric Anholt
@ 2011-09-20 21:16   ` Chris Wilson
  2011-09-21  7:02     ` Daniel Vetter
  2011-09-20 22:19   ` Ben Widawsky
  2 siblings, 1 reply; 28+ messages in thread
From: Chris Wilson @ 2011-09-20 21:16 UTC (permalink / raw)
  To: Daniel Vetter, Ben Widawsky; +Cc: intel-gfx

On Tue, 20 Sep 2011 13:06:43 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
>   Now non-blocking cpu mmaps make very much sense on llc/snooped buffer
>   objects. So I think we actually need an ioctl to get obj->cache_level so
>   userspace can decide whether it should use non-blocking gtt mmaps or cpu
>   (non-blocking) cpu mmaps. We might as well go full-circle, make Chris
>   happy and merge the corresponding set_cache_level ioclt to enable
>   snooped buffers on machines with ilk-like coherency (i.e. that atom
>   thing I'm hearing about ...). But imo that's material for non-blocking
>   mmaps, step 2.

That's not enough to make me completely happy. I also have the use-case
of wanting to efficiently handle compositing with client allocated
memory i.e. persistent ShmPixmaps, where coherency either needs to be
transparent to the client or (worse) synchronicity imposed by the server.
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
  2011-09-20 17:17   ` Eric Anholt
  2011-09-20 21:16   ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Chris Wilson
@ 2011-09-20 22:19   ` Ben Widawsky
  2011-09-21  7:07     ` Daniel Vetter
  2 siblings, 1 reply; 28+ messages in thread
From: Ben Widawsky @ 2011-09-20 22:19 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: intel-gfx

On Tue, 20 Sep 2011 13:06:43 +0200
Daniel Vetter <daniel@ffwll.ch> wrote:

> On Mon, Sep 19, 2011 at 09:25:00PM -0700, Ben Widawsky wrote:
> > I'm going to keep this short...
> > Patch 5 is my test case.
> > On Gen6 I see slightly better performance. On Gen5 I see really
> > big improvements (like 3x) for non-GTT write-only maps over
> > regular mmaps. GTT mappings don't really show any improvements as a
> > whole.
> >
> > Better tests would be nice, but without more significant Mesa
> > changes, or better benchmarks, I'm not sure how to get those.
> > While I think these patches are mostly complete, ideas for better
> > testing are very welcome. Also of course, general optimizations or
> > pointing out my errors would be nice.
> 
> Ok, I'm gonna be the dense annoying bastard here:
> 
> - Can we stop calling these mappings write-only? Afaics the
> distinguishing feature is that they're non-blocking. And yes, current
> users only use non-blocking paths to upload data because the amount
> of data we're currently downloading is so small. Hence we can use one
> bo for each download without wasting too much space and still avoid
> unnecessary blocking. But I think this will change, e.g. with designs
> like sna that tightly integrate gpu and sw rendering. Or OpenCL.

I really wanted to argue this point, but you're right. I think what I'll do is
make the libdrm stub be called non-blocking, and then per gen we can do
whatever with set_domain.

> 
> - Why do we need any patches for gtt non-blocking mmaps? I've re-read
> our code, and afaics we're only calling wait_rendering from gem_fault
> if obj->gtt_space == NULL. I.e. there's no way the gpu is currently
> using the data and hence no way for us to block on it.

I think you're right. I misread this before, we take an early exit if
its write domain is already GTT, so we don't sync flush at that point. I
believe this is rooted in the fact that my original versions of the
patch wouldn't call set domain at all (it used create() and mmap()
ioctls to mark buffers as write only), but with those changes, I think
this is unnecessary with prefaulting.

> I think the only thing needed is a small libdrm patch to enable non-blocking
> gtt mmaps
> 
>   void drm_intel_enable_non_blocking_gtt_mmap(obj)
> 
>   which sets a bit somewhere and moves the obj (once) into the gtt domain.
>   And a corresponding change in gtt_mmap to disable the set_domain call. This
>   only works as long as no one else access the object from the cpu domain,
>   but afaics we'll use non-blocking mmaps only for unshared buffers, so that
>   should be fine.
> 
>   I might also just be dense and not see the issue ...

I should have documented that these buffers should not be flinked. I
should probably enforce that in the driver. I will take a note to change
that. Anyway, I think you're right, and the result of this is to remove
the hunk from i915_gem_fault and prefault instead; do you agree?

> 
> - I'm sorry for having suggested to implement the clflush ioctl, I think
> it's a foolish idea now. Non-blocking mmaps are a performance
> optimization; needing to sync caches with clflush is very much the
> opposite. So I think we can dustbin this.

I disagree. I think it's a nice function to add for people too lazy to do
micro-optimizations. The flushing of the full object is almost
guaranteed to make performance worse though, that should really only be
for testing purposes.

> 
>   Now non-blocking cpu mmaps make very much sense on llc/snooped
> buffer objects. So I think we actually need an ioctl to get
> obj->cache_level so userspace can decide whether it should use
> non-blocking gtt mmaps or (non-blocking) cpu mmaps. We might as
> well go full-circle, make Chris happy and merge the corresponding
> set_cache_level ioctl to enable snooped buffers on machines with
> ilk-like coherency (i.e. that atom thing I'm hearing about ...). But
> imo that's material for non-blocking mmaps, step 2.

I'd need to research this a bit more, let me defer response on this
part. By the way, which set_cache_level ioctl are you referring to?

> 
> Cheers, Daniel

Ben


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 21:16   ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Chris Wilson
@ 2011-09-21  7:02     ` Daniel Vetter
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Vetter @ 2011-09-21  7:02 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Ben Widawsky, intel-gfx

On Tue, Sep 20, 2011 at 10:16:39PM +0100, Chris Wilson wrote:
> On Tue, 20 Sep 2011 13:06:43 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
> >   Now non-blocking cpu mmaps make very much sense on llc/snooped buffer
> >   objects. So I think we actually need an ioctl to get obj->cache_level so
> >   userspace can decide whether it should use non-blocking gtt mmaps or
> >   (non-blocking) cpu mmaps. We might as well go full-circle, make Chris
> >   happy and merge the corresponding set_cache_level ioctl to enable
> >   snooped buffers on machines with ilk-like coherency (i.e. that atom
> >   thing I'm hearing about ...). But imo that's material for non-blocking
> >   mmaps, step 2.
> 
> That's not enough to make me completely happy. I also have the use-case
> of wanting to efficiently handle compositing with client allocated
> memory i.e. persistent ShmPixmaps, where coherency either needs to be
> transparent to the client or (worse) synchronicity imposed by the server.

I know, just snoopable bo support only makes you half-happy and you need
fancy stuff in addition ;-) But if your sna branch can halfway easily
profit from simple snoopable bo support on !llc machines, I think that'll
give us an easy way to check api sanity for the new ioctl.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48


* Re: [PATCH 1/6] RFCish: write only mappings (aka non-blocking)
  2011-09-20 22:19   ` Ben Widawsky
@ 2011-09-21  7:07     ` Daniel Vetter
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Vetter @ 2011-09-21  7:07 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Tue, Sep 20, 2011 at 03:19:53PM -0700, Ben Widawsky wrote:
> On Tue, 20 Sep 2011 13:06:43 +0200
> Daniel Vetter <daniel@ffwll.ch> wrote:
> > - I'm sorry for having suggested to implement the clflush ioctl, I think
> > it's a foolish idea now. Non-blocking mmaps are a performance
> > optimization; needing to sync caches with clflush is very much the
> > opposite. So I think we can dustbin this.
> 
> I disagree. I think it's nice function to add for people too lazy to do
> micro-optimizations. The flushing of the full object is almost
> guaranteed to make performance worse though, that should really only be
> for testing purposes.

Let me whip up a simple patch for libdrm to show why I think we don't need
this.

> >   Now non-blocking cpu mmaps make very much sense on llc/snooped
> > buffer objects. So I think we actually need an ioctl to get
> > obj->cache_level so userspace can decide whether it should use
> > non-blocking gtt mmaps or cpu (non-blocking) cpu mmaps. We might as
> > well go full-circle, make Chris happy and merge the corresponding
> > set_cache_level ioctl to enable snooped buffers on machines with
> > ilk-like coherency (i.e. that atom thing I'm hearing about ...). But
> > imo that's material for non-blocking mmaps, step 2.
> 
> I'd need to research this a bit more, let me defer response on this
> part. By the way, which set_cache_level ioctl are you referring to?

The set_cache_level ioctl that doesn't exist yet ;-) Essentially something
to set/get obj->cache_level, so that Chris can use snoopable bos on !llc
machines. But as I've said, that's probably something for the extended
non-blocking mmap support and not something we need right away.
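
For illustration only, such an interface might look roughly like this
(a sketch; the struct layout and ioctl numbers are made up, nothing
like this exists yet -- the get side is what Ben's get_cache_type
patch later in this thread implements):

struct drm_i915_gem_cache_level {
	__u32 handle;		/* gem handle of the bo */
	__u32 cache_level;	/* e.g. I915_CACHE_NONE / I915_CACHE_LLC */
};

#define DRM_I915_GEM_SET_CACHE_LEVEL	0x2b	/* placeholder number */
#define DRM_I915_GEM_GET_CACHE_LEVEL	0x2c	/* placeholder number */
#define DRM_IOCTL_I915_GEM_SET_CACHE_LEVEL	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_GEM_SET_CACHE_LEVEL, struct drm_i915_gem_cache_level)
#define DRM_IOCTL_I915_GEM_GET_CACHE_LEVEL	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHE_LEVEL, struct drm_i915_gem_cache_level)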

Cheers, Daniel

-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH] intel: non-blocking mmaps on the cheap
  2011-09-20 19:19     ` Daniel Vetter
@ 2011-09-21  8:19       ` Daniel Vetter
  2011-09-21 18:11         ` Eric Anholt
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Vetter @ 2011-09-21  8:19 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

This adds a new mode to map gem objects in a non-blocking way. This
needs to be enabled on a per-object basis with object_enable_nonblocking.

The new kernel interface required to get the caching level/coherency
is not yet wired up. All the code to transparently choose between gtt
mappings or (if coherent) cpu mappings is already in place, though.

Cc: Eric Anholt <eric@anholt.net>
Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
Hi Eric,

Not really tested, but can you please take a quick look and see if this is
suitable for mesa and ack the general approach?

Thanks, Daniel

 intel/intel_bufmgr.h     |    4 +
 intel/intel_bufmgr_gem.c |  165 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 152 insertions(+), 17 deletions(-)

diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index 889ef46..95216dd 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -149,6 +149,10 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo);
 int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
 void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable);
 
+int drm_intel_gem_bo_enable_nonblocking_map(drm_intel_bo *bo);
+int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo);
+int drm_intel_gem_bo_unmap_nonblocking(drm_intel_bo *bo);
+
 int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, int crtc_id);
 
 int drm_intel_get_aperture_sizes(int fd, size_t *mappable, size_t *total);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 4f4de92..124d372 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -141,6 +141,9 @@ struct _drm_intel_bo_gem {
 	uint32_t swizzle_mode;
 	unsigned long stride;
 
+	unsigned nonblocking_mmap : 1;
+	unsigned gpu_coherent_cpu_mmap : 1;
+
 	time_t free_time;
 
 	/** Array passed to the DRM containing relocation information. */
@@ -937,6 +940,7 @@ drm_intel_gem_bo_unreference_final(drm_intel_bo *bo, time_t time)
 	}
 	bo_gem->reloc_count = 0;
 	bo_gem->used_as_reloc_target = 0;
+	bo_gem->nonblocking_mmap = 0;
 
 	DBG("bo_unreference final: %d (%s)\n",
 	    bo_gem->gem_handle, bo_gem->name);
@@ -998,15 +1002,11 @@ static void drm_intel_gem_bo_unreference(drm_intel_bo *bo)
 	}
 }
 
-static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+static int do_mmap_cpu(drm_intel_bufmgr_gem *bufmgr_gem,
+		       drm_intel_bo_gem *bo_gem)
 {
-	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
-	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
-	struct drm_i915_gem_set_domain set_domain;
 	int ret;
 
-	pthread_mutex_lock(&bufmgr_gem->lock);
-
 	/* Allow recursive mapping. Mesa may recursively map buffers with
 	 * nested display loops.
 	 */
@@ -1018,7 +1018,7 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 		memset(&mmap_arg, 0, sizeof(mmap_arg));
 		mmap_arg.handle = bo_gem->gem_handle;
 		mmap_arg.offset = 0;
-		mmap_arg.size = bo->size;
+		mmap_arg.size = bo_gem->bo.size;
 		ret = drmIoctl(bufmgr_gem->fd,
 			       DRM_IOCTL_I915_GEM_MMAP,
 			       &mmap_arg);
@@ -1027,11 +1027,28 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 			DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
 			    __FILE__, __LINE__, bo_gem->gem_handle,
 			    bo_gem->name, strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 		bo_gem->mem_virtual = (void *)(uintptr_t) mmap_arg.addr_ptr;
 	}
+	return 0;
+}
+
+static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	assert(!bo_gem->nonblocking_mmap);
+
+	ret = do_mmap_cpu(bufmgr_gem, bo_gem);
+	if (ret != 0) {
+		pthread_mutex_unlock(&bufmgr_gem->lock);
+		return ret;
+	}
 	DBG("bo_map: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
 	    bo_gem->mem_virtual);
 	bo->virtual = bo_gem->mem_virtual;
@@ -1056,15 +1073,11 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 	return 0;
 }
 
-int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+static int do_mmap_gtt(drm_intel_bufmgr_gem *bufmgr_gem,
+		       drm_intel_bo_gem *bo_gem)
 {
-	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
-	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
-	struct drm_i915_gem_set_domain set_domain;
 	int ret;
 
-	pthread_mutex_lock(&bufmgr_gem->lock);
-
 	/* Get a mapping of the buffer if we haven't before. */
 	if (bo_gem->gtt_virtual == NULL) {
 		struct drm_i915_gem_mmap_gtt mmap_arg;
@@ -1085,12 +1098,11 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 			    __FILE__, __LINE__,
 			    bo_gem->gem_handle, bo_gem->name,
 			    strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 
 		/* and mmap it */
-		bo_gem->gtt_virtual = mmap(0, bo->size, PROT_READ | PROT_WRITE,
+		bo_gem->gtt_virtual = mmap(0, bo_gem->bo.size, PROT_READ | PROT_WRITE,
 					   MAP_SHARED, bufmgr_gem->fd,
 					   mmap_arg.offset);
 		if (bo_gem->gtt_virtual == MAP_FAILED) {
@@ -1100,11 +1112,29 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 			    __FILE__, __LINE__,
 			    bo_gem->gem_handle, bo_gem->name,
 			    strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 	}
 
+	return 0;
+}
+
+int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	assert(!bo_gem->nonblocking_mmap);
+
+	ret = do_mmap_gtt(bufmgr_gem, bo_gem);
+	if (ret != 0) {
+		pthread_mutex_unlock(&bufmgr_gem->lock);
+		return ret;
+	}
+
 	bo->virtual = bo_gem->gtt_virtual;
 
 	DBG("bo_map_gtt: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
@@ -1284,6 +1314,105 @@ drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable)
 	}
 }
 
+/**
+ * Enable non-blocking mmap on this object.
+ *
+ * The object may not be tiled and cannot be shared with flink. The other mmap
+ * functions will be disabled.
+ */
+int
+drm_intel_gem_bo_enable_nonblocking_map(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int ret;
+
+	assert(bo_gem->tiling_mode == I915_TILING_NONE);
+	assert(bo_gem->global_name == 0);
+
+	/* Move object to the gtt domain _once_. That's the right thing even when
+	 * using cpu mmaps, because we'll be using them only when they're fully
+	 * coherent with the gtt mappings. */
+	set_domain.handle = bo_gem->gem_handle;
+	set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+	set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+	ret = drmIoctl(bufmgr_gem->fd,
+		       DRM_IOCTL_I915_GEM_SET_DOMAIN,
+		       &set_domain);
+	if (ret != 0) {
+		DBG("%s:%d: Error setting domain %d: %s\n",
+		    __FILE__, __LINE__, bo_gem->gem_handle,
+		    strerror(errno));
+		return ret;
+	}
+
+	bo_gem->nonblocking_mmap = 1;
+	/* TODO: ask kernel about llc caching */
+	bo_gem->gpu_coherent_cpu_mmap = 0;
+
+	return 0;
+}
+
+/**
+ * map an object in the non-blocking mode
+ *
+ * This automagically chooses either the gtt mapping or, if coherent and
+ * faster, the cpu mapping.
+ */
+int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	assert(bo_gem->nonblocking_mmap);
+
+	if (bo_gem->gpu_coherent_cpu_mmap) {
+		ret = do_mmap_cpu(bufmgr_gem, bo_gem);
+		if (ret != 0) {
+			pthread_mutex_unlock(&bufmgr_gem->lock);
+			return ret;
+		}
+
+		bo->virtual = bo_gem->mem_virtual;
+	} else {
+		ret = do_mmap_gtt(bufmgr_gem, bo_gem);
+		if (ret != 0) {
+			pthread_mutex_unlock(&bufmgr_gem->lock);
+			return ret;
+		}
+
+		bo->virtual = bo_gem->gtt_virtual;
+	}
+
+	DBG("bo_map_nonblocking: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
+	    bo->virtual);
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
+}
+
+/**
+ * unmap an object in the non-blocking mode
+ */
+int drm_intel_gem_bo_unmap_nonblocking(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem;
+
+	if (bo == NULL)
+		return 0;
+	bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	bo->virtual = NULL;
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
+}
+
 static void
 drm_intel_bufmgr_gem_destroy(drm_intel_bufmgr *bufmgr)
 {
@@ -1790,6 +1919,8 @@ drm_intel_gem_bo_flink(drm_intel_bo *bo, uint32_t * name)
 	struct drm_gem_flink flink;
 	int ret;
 
+	assert(!bo_gem->nonblocking_mmap);
+
 	if (!bo_gem->global_name) {
 		memset(&flink, 0, sizeof(flink));
 		flink.handle = bo_gem->gem_handle;
-- 
1.7.6.2

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] intel: non-blocking mmaps on the cheap
  2011-09-21  8:19       ` [PATCH] intel: non-blocking mmaps on the cheap Daniel Vetter
@ 2011-09-21 18:11         ` Eric Anholt
  2011-09-21 19:19           ` Daniel Vetter
  0 siblings, 1 reply; 28+ messages in thread
From: Eric Anholt @ 2011-09-21 18:11 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky


[-- Attachment #1.1: Type: text/plain, Size: 1395 bytes --]

On Wed, 21 Sep 2011 10:19:13 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> This adds a new mode to map gem objects in a non-blocking way. This
> needs to be enabled on a per-object basis with object_enable_nonblocking.
> 
> The new kernel interface required to get the caching level/coherency
> is not yet wired up. All the code to transparently choose between gtt
> mappings or (if coherent) cpu mappings is already in place, though.
> 
> Cc: Eric Anholt <eric@anholt.net>
> Cc: Ben Widawsky <ben@bwidawsk.net>
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
> Hi Eric,
> 
> Not really tested, but can you please take a quick look and see if this is
> suitable for mesa and ack the general approach?

It doesn't quite look like what I want for Mesa, but close.

Don't-block-I-know-what-I'm-doing is not a long-term property of a
mapping for GL, it's something you say when you're getting a pointer to
a piece of the existing mapping, and you may ask for normal blocking
mode again later.  So I'd rather see the map/unmap track "oh, we
already set to GTT and we haven't done something that would have knocked
it out of there, so no need to go set to GTT again", instead of setting
a flag.  Also, I think you need to re-set to GTT after a CPU mapping on
!llc, since clflushing works at a coarser granularity than GL buffer
mappings (which can start and end at arbitrary byte boundaries).
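
The tracking described here might look roughly like this (a sketch of
the idea only, with made-up helper names; Daniel's follow-up below
implements it with an in_gtt_domain flag):

static void start_nonblocking_access(drm_intel_bo_gem *bo_gem)
{
	/* Skip the blocking set_domain ioctl when we know the bo is
	 * still in the GTT domain from a previous map. */
	if (!bo_gem->in_gtt_domain) {
		/* blocking set_domain(GTT, GTT) ioctl goes here */
		bo_gem->in_gtt_domain = 1;
	}
}

static void note_cpu_access(drm_intel_bo_gem *bo_gem, int llc)
{
	/* A CPU map on !llc knocks the bo out of the GTT domain, so
	 * the next non-blocking map must set the domain (and block)
	 * once again. */
	if (!llc)
		bo_gem->in_gtt_domain = 0;
}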

[-- Attachment #1.2: Type: application/pgp-signature, Size: 197 bytes --]

[-- Attachment #2: Type: text/plain, Size: 159 bytes --]

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
http://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH] intel: non-blocking mmaps on the cheap
  2011-09-21 18:11         ` Eric Anholt
@ 2011-09-21 19:19           ` Daniel Vetter
  2011-09-22  1:47             ` [PATCH cont'd] " Ben Widawsky
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Vetter @ 2011-09-21 19:19 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky

This adds a new mode to map gem objects in a non-blocking way.

The new kernel interface required to get the caching level/coherency
is not yet wired up. All the code to transparently choose between gtt
mappings or (if coherent) cpu mappings is already in place, though.

To make this usable for mesa, we need to allow other buffer access in
between non-blocking modes. It's not a big problem if this accidentally
blocks once when switching back to non-blocking maps, but domains need
to be correctly tracked.

I've gone through historical kernel versions and luckily the only
place we move a bo from the gtt domain to the cpu domain is in the
set_domain ioctl. pwrite/pread don't move objects into the cpu domain
as long as they're untiled or llc cached.

So track whether an object is in the gtt domain and invalidate this
only on access to cpu mappings.

Cc: Eric Anholt <eric@anholt.net>
Cc: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 intel/intel_bufmgr.h     |    3 +
 intel/intel_bufmgr_gem.c |  156 +++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 142 insertions(+), 17 deletions(-)

diff --git a/intel/intel_bufmgr.h b/intel/intel_bufmgr.h
index 889ef46..47f3416 100644
--- a/intel/intel_bufmgr.h
+++ b/intel/intel_bufmgr.h
@@ -149,6 +149,9 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo);
 int drm_intel_gem_bo_unmap_gtt(drm_intel_bo *bo);
 void drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable);
 
+int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo);
+int drm_intel_gem_bo_unmap_nonblocking(drm_intel_bo *bo);
+
 int drm_intel_get_pipe_from_crtc_id(drm_intel_bufmgr *bufmgr, int crtc_id);
 
 int drm_intel_get_aperture_sizes(int fd, size_t *mappable, size_t *total);
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index 4f4de92..ff4b663 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -141,6 +141,8 @@ struct _drm_intel_bo_gem {
 	uint32_t swizzle_mode;
 	unsigned long stride;
 
+	unsigned in_gtt_domain : 1;
+
 	time_t free_time;
 
 	/** Array passed to the DRM containing relocation information. */
@@ -998,15 +1000,11 @@ static void drm_intel_gem_bo_unreference(drm_intel_bo *bo)
 	}
 }
 
-static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+static int do_mmap_cpu(drm_intel_bufmgr_gem *bufmgr_gem,
+		       drm_intel_bo_gem *bo_gem)
 {
-	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
-	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
-	struct drm_i915_gem_set_domain set_domain;
 	int ret;
 
-	pthread_mutex_lock(&bufmgr_gem->lock);
-
 	/* Allow recursive mapping. Mesa may recursively map buffers with
 	 * nested display loops.
 	 */
@@ -1018,7 +1016,7 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 		memset(&mmap_arg, 0, sizeof(mmap_arg));
 		mmap_arg.handle = bo_gem->gem_handle;
 		mmap_arg.offset = 0;
-		mmap_arg.size = bo->size;
+		mmap_arg.size = bo_gem->bo.size;
 		ret = drmIoctl(bufmgr_gem->fd,
 			       DRM_IOCTL_I915_GEM_MMAP,
 			       &mmap_arg);
@@ -1027,11 +1025,27 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 			DBG("%s:%d: Error mapping buffer %d (%s): %s .\n",
 			    __FILE__, __LINE__, bo_gem->gem_handle,
 			    bo_gem->name, strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 		bo_gem->mem_virtual = (void *)(uintptr_t) mmap_arg.addr_ptr;
 	}
+	return 0;
+}
+
+static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	ret = do_mmap_cpu(bufmgr_gem, bo_gem);
+	if (ret != 0) {
+		pthread_mutex_unlock(&bufmgr_gem->lock);
+		return ret;
+	}
 	DBG("bo_map: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
 	    bo_gem->mem_virtual);
 	bo->virtual = bo_gem->mem_virtual;
@@ -1051,20 +1065,19 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 		    strerror(errno));
 	}
 
+	/* TODO ask kernel if llc cached, in that case don't clear this */
+	bo_gem->in_gtt_domain = 0;
+
 	pthread_mutex_unlock(&bufmgr_gem->lock);
 
 	return 0;
 }
 
-int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+static int do_mmap_gtt(drm_intel_bufmgr_gem *bufmgr_gem,
+		       drm_intel_bo_gem *bo_gem)
 {
-	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
-	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
-	struct drm_i915_gem_set_domain set_domain;
 	int ret;
 
-	pthread_mutex_lock(&bufmgr_gem->lock);
-
 	/* Get a mapping of the buffer if we haven't before. */
 	if (bo_gem->gtt_virtual == NULL) {
 		struct drm_i915_gem_mmap_gtt mmap_arg;
@@ -1085,12 +1098,11 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 			    __FILE__, __LINE__,
 			    bo_gem->gem_handle, bo_gem->name,
 			    strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 
 		/* and mmap it */
-		bo_gem->gtt_virtual = mmap(0, bo->size, PROT_READ | PROT_WRITE,
+		bo_gem->gtt_virtual = mmap(0, bo_gem->bo.size, PROT_READ | PROT_WRITE,
 					   MAP_SHARED, bufmgr_gem->fd,
 					   mmap_arg.offset);
 		if (bo_gem->gtt_virtual == MAP_FAILED) {
@@ -1100,11 +1112,28 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 			    __FILE__, __LINE__,
 			    bo_gem->gem_handle, bo_gem->name,
 			    strerror(errno));
-			pthread_mutex_unlock(&bufmgr_gem->lock);
 			return ret;
 		}
 	}
 
+	return 0;
+}
+
+int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+
+	ret = do_mmap_gtt(bufmgr_gem, bo_gem);
+	if (ret != 0) {
+		pthread_mutex_unlock(&bufmgr_gem->lock);
+		return ret;
+	}
+
 	bo->virtual = bo_gem->gtt_virtual;
 
 	DBG("bo_map_gtt: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
@@ -1123,6 +1152,8 @@ int drm_intel_gem_bo_map_gtt(drm_intel_bo *bo)
 		    strerror(errno));
 	}
 
+	bo_gem->in_gtt_domain = 1;
+
 	pthread_mutex_unlock(&bufmgr_gem->lock);
 
 	return 0;
@@ -1282,6 +1313,97 @@ drm_intel_gem_bo_start_gtt_access(drm_intel_bo *bo, int write_enable)
 		    set_domain.read_domains, set_domain.write_domain,
 		    strerror(errno));
 	}
+
+	bo_gem->in_gtt_domain = 1;
+}
+
+/**
+ * map an object in the non-blocking mode
+ *
+ * This automagically chooses either the gtt mapping or, if coherent and
+ * faster, the cpu mapping. Might still block under a few conditions (first
+ * non-blocking access on a new buffer object, or accessing the buffer through
+ * a blocking cpu map).
+ *
+ * Not allowed on tiled buffers (to prevent headaches with swizzling and
+ * tracking the gem domain).
+ *
+ * Probably not a good idea to share these buffers with flink.
+ */
+int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	struct drm_i915_gem_set_domain set_domain;
+	int gpu_coherent_cpu_map;
+	int ret;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	assert(bo_gem->tiling_mode != I915_TILING_NONE);
+
+	/* TODO ask the kernel about this */
+	gpu_coherent_cpu_map = 1;
+
+	if (gpu_coherent_cpu_map) {
+		ret = do_mmap_cpu(bufmgr_gem, bo_gem);
+		if (ret != 0) {
+			pthread_mutex_unlock(&bufmgr_gem->lock);
+			return ret;
+		}
+
+		bo->virtual = bo_gem->mem_virtual;
+	} else {
+		ret = do_mmap_gtt(bufmgr_gem, bo_gem);
+		if (ret != 0) {
+			pthread_mutex_unlock(&bufmgr_gem->lock);
+			return ret;
+		}
+
+		bo->virtual = bo_gem->gtt_virtual;
+	}
+
+	DBG("bo_map_nonblocking: %d (%s) -> %p\n", bo_gem->gem_handle, bo_gem->name,
+	    bo->virtual);
+
+	/* Move it to the GTT domain in case it isn't there yet */
+	if (!bo_gem->in_gtt_domain) {
+		set_domain.handle = bo_gem->gem_handle;
+		set_domain.read_domains = I915_GEM_DOMAIN_GTT;
+		set_domain.write_domain = I915_GEM_DOMAIN_GTT;
+		ret = drmIoctl(bufmgr_gem->fd,
+			       DRM_IOCTL_I915_GEM_SET_DOMAIN,
+			       &set_domain);
+		if (ret != 0) {
+			DBG("%s:%d: Error setting domain %d: %s\n",
+			    __FILE__, __LINE__, bo_gem->gem_handle,
+			    strerror(errno));
+		}
+
+		bo_gem->in_gtt_domain = 1;
+	}
+
+
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
+}
+
+/**
+ * unmap an object in the non-blocking mode
+ */
+int drm_intel_gem_bo_unmap_nonblocking(drm_intel_bo *bo)
+{
+	drm_intel_bufmgr_gem *bufmgr_gem;
+
+	if (bo == NULL)
+		return 0;
+	bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+
+	pthread_mutex_lock(&bufmgr_gem->lock);
+	bo->virtual = NULL;
+	pthread_mutex_unlock(&bufmgr_gem->lock);
+
+	return 0;
 }
 
 static void
-- 
1.7.6.2

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH cont'd] intel: non-blocking mmaps on the cheap
  2011-09-21 19:19           ` Daniel Vetter
@ 2011-09-22  1:47             ` Ben Widawsky
  2011-09-22  1:47               ` [PATCH] drm/i915: ioctl to query a bo's cache level Ben Widawsky
                                 ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Ben Widawsky @ 2011-09-22  1:47 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter

The drm patch goes on top of Daniel's patch. I'll squash before pushing,
it's just separate for review.

The kernel stuff is straightforward, just an IOCTL to query the cache
level.

Both patches will get better commit messages when resubmitted.

I ran a modified version of the test I posted earlier.
On Sandybridge I see no improvement.
On Ironlake, the improvement over regular GTT mappings is:
21.7486% +/- 1.17445%

These results are similar to my earlier patch series.

Overall, I like this series much better. Do we want to sneak in a
set_cache_level with the same IOCTL while we're at it?
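
For reference, a minimal sketch of how a client might drive the new
entry points (hypothetical usage only; bufmgr setup and error handling
omitted):

#include <string.h>
#include "intel_bufmgr.h"

static void upload(drm_intel_bufmgr *bufmgr, const void *data, size_t size)
{
	drm_intel_bo *bo = drm_intel_bo_alloc(bufmgr, "upload", size, 4096);

	/* Picks the cpu or gtt mapping and only blocks when the
	 * domain tracking says it has to. */
	if (drm_intel_gem_bo_map_nonblocking(bo) == 0) {
		memcpy(bo->virtual, data, size);
		drm_intel_gem_bo_unmap_nonblocking(bo);
	}
	drm_intel_bo_unreference(bo);
}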

Ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* [PATCH] drm/i915: ioctl to query a bo's cache level
  2011-09-22  1:47             ` [PATCH cont'd] " Ben Widawsky
@ 2011-09-22  1:47               ` Ben Widawsky
  2011-09-22  7:35                 ` Daniel Vetter
  2011-09-22  1:47               ` [PATCH] on top of daniel Ben Widawsky
  2011-09-22  7:33               ` [PATCH cont'd] intel: non-blocking mmaps on the cheap Daniel Vetter
  2 siblings, 1 reply; 28+ messages in thread
From: Ben Widawsky @ 2011-09-22  1:47 UTC (permalink / raw)
  To: intel-gfx; +Cc: Daniel Vetter, Ben Widawsky


Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 drivers/gpu/drm/i915/i915_dma.c |    1 +
 drivers/gpu/drm/i915/i915_drv.h |    2 ++
 drivers/gpu/drm/i915/i915_gem.c |   23 +++++++++++++++++++++++
 include/drm/i915_drm.h          |    7 +++++++
 4 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 8a3942c..8178cbb 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -2292,6 +2292,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
 	DRM_IOCTL_DEF_DRV(I915_GEM_MADVISE, i915_gem_madvise_ioctl, DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
 	DRM_IOCTL_DEF_DRV(I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
+	DRM_IOCTL_DEF_DRV(I915_GEM_GET_CACHE_TYPE, i915_gem_get_cache_type_ioctl, DRM_UNLOCKED),
 };
 
 int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 7916bd9..2dfcb3d 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -1118,6 +1118,8 @@ int i915_gem_get_tiling(struct drm_device *dev, void *data,
 			struct drm_file *file_priv);
 int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
 				struct drm_file *file_priv);
+int i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
+				  struct drm_file *file_priv);
 void i915_gem_load(struct drm_device *dev);
 int i915_gem_init_object(struct drm_gem_object *obj);
 int __must_check i915_gem_flush_ring(struct intel_ring_buffer *ring,
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a546a71..362da16 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -3777,6 +3777,29 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
 }
 
 int
+i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
+			      struct drm_file *file_priv)
+{
+
+	struct drm_i915_gem_get_cache_type *args = data;
+	struct drm_i915_gem_object *obj;
+	int ret = 0;
+
+	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
+	if (&obj->base == NULL) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	args->cache_level = obj->cache_level;
+
+	drm_gem_object_unreference(&obj->base);
+
+out:
+	return ret;
+}
+
+int
 i915_gem_idle(struct drm_device *dev)
 {
 	drm_i915_private_t *dev_priv = dev->dev_private;
diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index 28c0d11..a0dff8a 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_GET_CACHE_TYPE	0x2a
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -239,6 +240,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
 #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image)
 #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_GEM_GET_CACHE_TYPE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHE_TYPE, struct drm_i915_gem_get_cache_type)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -844,4 +846,9 @@ struct drm_intel_overlay_attrs {
 	__u32 gamma5;
 };
 
+struct drm_i915_gem_get_cache_type {
+	__u32 handle;
+	__u32 cache_level;
+};
+
 #endif				/* _I915_DRM_H_ */
-- 
1.7.6.3

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH] on top of daniel
  2011-09-22  1:47             ` [PATCH cont'd] " Ben Widawsky
  2011-09-22  1:47               ` [PATCH] drm/i915: ioctl to query a bo's cache level Ben Widawsky
@ 2011-09-22  1:47               ` Ben Widawsky
  2011-09-22  7:39                 ` Daniel Vetter
  2011-09-22  7:33               ` [PATCH cont'd] intel: non-blocking mmaps on the cheap Daniel Vetter
  2 siblings, 1 reply; 28+ messages in thread
From: Ben Widawsky @ 2011-09-22  1:47 UTC (permalink / raw)
  To: intel-gfx; +Cc: Ben Widawsky

Daniel Vetter <daniel.vetter@ffwll.ch>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
---
 include/drm/i915_drm.h   |    7 +++++++
 intel/intel_bufmgr_gem.c |   36 ++++++++++++++++++++++++++++++------
 2 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
index adc2392..4b62222 100644
--- a/include/drm/i915_drm.h
+++ b/include/drm/i915_drm.h
@@ -189,6 +189,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
 #define DRM_I915_OVERLAY_ATTRS	0x28
 #define DRM_I915_GEM_EXECBUFFER2	0x29
+#define DRM_I915_GEM_GET_CACHE_TYPE	0x2a
 
 #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
 #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
@@ -230,6 +231,7 @@ typedef struct _drm_i915_sarea {
 #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
 #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_IOCTL_I915_OVERLAY_ATTRS, struct drm_intel_overlay_put_image)
 #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
+#define DRM_IOCTL_I915_GEM_GET_CACHE_TYPE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHE_TYPE, struct drm_i915_gem_get_cache_type)
 
 /* Allow drivers to submit batchbuffers directly to hardware, relying
  * on the security mechanisms provided by hardware.
@@ -835,4 +837,9 @@ struct drm_intel_overlay_attrs {
 	__u32 gamma5;
 };
 
+struct drm_i915_gem_get_cache_type {
+	__u32 handle;
+	__u32 cache_level;
+};
+
 #endif				/* _I915_DRM_H_ */
diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
index ff4b663..65a77d6 100644
--- a/intel/intel_bufmgr_gem.c
+++ b/intel/intel_bufmgr_gem.c
@@ -1032,6 +1032,30 @@ static int do_mmap_cpu(drm_intel_bufmgr_gem *bufmgr_gem,
 	return 0;
 }
 
+enum i915_cache_level {
+	I915_CACHE_NONE,
+	I915_CACHE_LLC,
+	I915_CACHE_LLC_MLC, /* gen6+ */
+};
+
+static int get_cache_type(drm_intel_bo *bo)
+{
+	struct drm_i915_gem_get_cache_type cache = {0, 0};
+	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
+	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
+	int ret = 0;
+
+	cache.handle = bo_gem->gem_handle;
+	ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GEM_GET_CACHE_TYPE,
+		       &cache);
+
+	/* This should maintain old behavior */
+	if (ret)
+		return 0;
+
+	return cache.cache_level;
+}
+
 static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 {
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
@@ -1065,8 +1089,8 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
 		    strerror(errno));
 	}
 
-	/* TODO ask kernel if llc cached, in that case don't clear this */
-	bo_gem->in_gtt_domain = 0;
+	if (get_cache_type(bo) != I915_CACHE_LLC)
+		bo_gem->in_gtt_domain = 0;
 
 	pthread_mutex_unlock(&bufmgr_gem->lock);
 
@@ -1335,14 +1359,14 @@ int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo)
 	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
 	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
 	struct drm_i915_gem_set_domain set_domain;
-	int gpu_coherent_cpu_map;
+	int gpu_coherent_cpu_map = 0;
 	int ret;
 
 	pthread_mutex_lock(&bufmgr_gem->lock);
-	assert(bo_gem->tiling_mode != I915_TILING_NONE);
+	assert(bo_gem->tiling_mode == I915_TILING_NONE);
 
-	/* TODO ask the kernel about this */
-	gpu_coherent_cpu_map = 1;
+	if (bufmgr_gem->gen >= 6)
+		gpu_coherent_cpu_map = 1;
 
 	if (gpu_coherent_cpu_map) {
 		ret = do_mmap_cpu(bufmgr_gem, bo_gem);
-- 
1.7.6.3

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH cont'd] intel: non-blocking mmaps on the cheap
  2011-09-22  1:47             ` [PATCH cont'd] " Ben Widawsky
  2011-09-22  1:47               ` [PATCH] drm/i915: ioctl to query a bo's cache level Ben Widawsky
  2011-09-22  1:47               ` [PATCH] on top of daniel Ben Widawsky
@ 2011-09-22  7:33               ` Daniel Vetter
  2 siblings, 0 replies; 28+ messages in thread
From: Daniel Vetter @ 2011-09-22  7:33 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Daniel Vetter, intel-gfx

On Wed, Sep 21, 2011 at 06:47:09PM -0700, Ben Widawsky wrote:
> Overall, I like this series much better. Do we want to sneak in a
> set_cache_level with the same IOCTL while we're at it?

I think the set_cache_level can wait a tad bit, and this stuff here looks
(almost) ready. A few nitpicks remain, though.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] drm/i915: ioctl to query a bo's cache level
  2011-09-22  1:47               ` [PATCH] drm/i915: ioctl to query a bo's cache level Ben Widawsky
@ 2011-09-22  7:35                 ` Daniel Vetter
  2011-09-22 15:36                   ` Ben Widawsky
  0 siblings, 1 reply; 28+ messages in thread
From: Daniel Vetter @ 2011-09-22  7:35 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: Daniel Vetter, intel-gfx

On Wed, Sep 21, 2011 at 06:47:10PM -0700, Ben Widawsky wrote:
> 
> Cc: Daniel Vetter <daniel.vetter@ffwll.ch>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_dma.c |    1 +
>  drivers/gpu/drm/i915/i915_drv.h |    2 ++
>  drivers/gpu/drm/i915/i915_gem.c |   23 +++++++++++++++++++++++
>  include/drm/i915_drm.h          |    7 +++++++
>  4 files changed, 33 insertions(+), 0 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
> index 8a3942c..8178cbb 100644
> --- a/drivers/gpu/drm/i915/i915_dma.c
> +++ b/drivers/gpu/drm/i915/i915_dma.c
> @@ -2292,6 +2292,7 @@ struct drm_ioctl_desc i915_ioctls[] = {
>  	DRM_IOCTL_DEF_DRV(I915_GEM_MADVISE, i915_gem_madvise_ioctl, DRM_UNLOCKED),
>  	DRM_IOCTL_DEF_DRV(I915_OVERLAY_PUT_IMAGE, intel_overlay_put_image, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
>  	DRM_IOCTL_DEF_DRV(I915_OVERLAY_ATTRS, intel_overlay_attrs, DRM_MASTER|DRM_CONTROL_ALLOW|DRM_UNLOCKED),
> +	DRM_IOCTL_DEF_DRV(I915_GEM_GET_CACHE_TYPE, i915_gem_get_cache_type_ioctl, DRM_UNLOCKED),
>  };
>  
>  int i915_max_ioctl = DRM_ARRAY_SIZE(i915_ioctls);
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 7916bd9..2dfcb3d 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -1118,6 +1118,8 @@ int i915_gem_get_tiling(struct drm_device *dev, void *data,
>  			struct drm_file *file_priv);
>  int i915_gem_get_aperture_ioctl(struct drm_device *dev, void *data,
>  				struct drm_file *file_priv);
> +int i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
> +				  struct drm_file *file_priv);
>  void i915_gem_load(struct drm_device *dev);
>  int i915_gem_init_object(struct drm_gem_object *obj);
>  int __must_check i915_gem_flush_ring(struct intel_ring_buffer *ring,
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index a546a71..362da16 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -3777,6 +3777,29 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
>  }
>  
>  int
> +i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
> +			      struct drm_file *file_priv)
> +{
> +
> +	struct drm_i915_gem_get_cache_type *args = data;
> +	struct drm_i915_gem_object *obj;
> +	int ret = 0;
> +
> +	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
> +	if (&obj->base == NULL) {
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +
> +	args->cache_level = obj->cache_level;

Grab struct_mutex around this, obj->cache_level might change over the
lifetime of the bo. Yeah, our locking is a mess, I know ;-)
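
Something like this might do (untested sketch, using the same locking
helpers as the other gem ioctls):

	ret = i915_mutex_lock_interruptible(dev);
	if (ret)
		return ret;

	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
	if (&obj->base == NULL) {
		mutex_unlock(&dev->struct_mutex);
		return -ENOENT;
	}

	args->cache_level = obj->cache_level;
	drm_gem_object_unreference(&obj->base);
	mutex_unlock(&dev->struct_mutex);

	return 0;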

> +	drm_gem_object_unreference(&obj->base);
> +
> +out:
> +	return ret;
> +}
> +
> +int
>  i915_gem_idle(struct drm_device *dev)
>  {
>  	drm_i915_private_t *dev_priv = dev->dev_private;
> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
> index 28c0d11..a0dff8a 100644
> --- a/include/drm/i915_drm.h
> +++ b/include/drm/i915_drm.h
> @@ -198,6 +198,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
>  #define DRM_I915_OVERLAY_ATTRS	0x28
>  #define DRM_I915_GEM_EXECBUFFER2	0x29
> +#define DRM_I915_GEM_GET_CACHE_TYPE	0x2a
>  
>  #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
>  #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
> @@ -239,6 +240,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
>  #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_I915_OVERLAY_PUT_IMAGE, struct drm_intel_overlay_put_image)
>  #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
> +#define DRM_IOCTL_I915_GEM_GET_CACHE_TYPE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHE_TYPE, struct drm_i915_gem_get_cache_type)
>  
>  /* Allow drivers to submit batchbuffers directly to hardware, relying
>   * on the security mechanisms provided by hardware.
> @@ -844,4 +846,9 @@ struct drm_intel_overlay_attrs {
>  	__u32 gamma5;
>  };
>  
> +struct drm_i915_gem_get_cache_type {
> +	__u32 handle;
> +	__u32 cache_level;
> +};
> +
>  #endif				/* _I915_DRM_H_ */
> -- 
> 1.7.6.3
> 

-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] on top of daniel
  2011-09-22  1:47               ` [PATCH] on top of daniel Ben Widawsky
@ 2011-09-22  7:39                 ` Daniel Vetter
  0 siblings, 0 replies; 28+ messages in thread
From: Daniel Vetter @ 2011-09-22  7:39 UTC (permalink / raw)
  To: Ben Widawsky; +Cc: intel-gfx

On Wed, Sep 21, 2011 at 06:47:11PM -0700, Ben Widawsky wrote:
> Daniel Vetter <daniel.vetter@ffwll.ch>
> Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
> ---
>  include/drm/i915_drm.h   |    7 +++++++
>  intel/intel_bufmgr_gem.c |   36 ++++++++++++++++++++++++++++++------
>  2 files changed, 37 insertions(+), 6 deletions(-)
> 
> diff --git a/include/drm/i915_drm.h b/include/drm/i915_drm.h
> index adc2392..4b62222 100644
> --- a/include/drm/i915_drm.h
> +++ b/include/drm/i915_drm.h
> @@ -189,6 +189,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_I915_OVERLAY_PUT_IMAGE	0x27
>  #define DRM_I915_OVERLAY_ATTRS	0x28
>  #define DRM_I915_GEM_EXECBUFFER2	0x29
> +#define DRM_I915_GEM_GET_CACHE_TYPE	0x2a
>  
>  #define DRM_IOCTL_I915_INIT		DRM_IOW( DRM_COMMAND_BASE + DRM_I915_INIT, drm_i915_init_t)
>  #define DRM_IOCTL_I915_FLUSH		DRM_IO ( DRM_COMMAND_BASE + DRM_I915_FLUSH)
> @@ -230,6 +231,7 @@ typedef struct _drm_i915_sarea {
>  #define DRM_IOCTL_I915_GEM_MADVISE	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_GEM_MADVISE, struct drm_i915_gem_madvise)
>  #define DRM_IOCTL_I915_OVERLAY_PUT_IMAGE	DRM_IOW(DRM_COMMAND_BASE + DRM_IOCTL_I915_OVERLAY_ATTRS, struct drm_intel_overlay_put_image)
>  #define DRM_IOCTL_I915_OVERLAY_ATTRS	DRM_IOWR(DRM_COMMAND_BASE + DRM_I915_OVERLAY_ATTRS, struct drm_intel_overlay_attrs)
> +#define DRM_IOCTL_I915_GEM_GET_CACHE_TYPE	DRM_IOR  (DRM_COMMAND_BASE + DRM_I915_GEM_GET_CACHE_TYPE, struct drm_i915_gem_get_cache_type)
>  
>  /* Allow drivers to submit batchbuffers directly to hardware, relying
>   * on the security mechanisms provided by hardware.
> @@ -835,4 +837,9 @@ struct drm_intel_overlay_attrs {
>  	__u32 gamma5;
>  };
>  
> +struct drm_i915_gem_get_cache_type {
> +	__u32 handle;
> +	__u32 cache_level;
> +};
> +
>  #endif				/* _I915_DRM_H_ */
> diff --git a/intel/intel_bufmgr_gem.c b/intel/intel_bufmgr_gem.c
> index ff4b663..65a77d6 100644
> --- a/intel/intel_bufmgr_gem.c
> +++ b/intel/intel_bufmgr_gem.c
> @@ -1032,6 +1032,30 @@ static int do_mmap_cpu(drm_intel_bufmgr_gem *bufmgr_gem,
>  	return 0;
>  }
>  
> +enum i915_cache_level {
> +	I915_CACHE_NONE,
> +	I915_CACHE_LLC,
> +	I915_CACHE_LLC_MLC, /* gen6+ */
> +};
> +
> +static int get_cache_type(drm_intel_bo *bo)
> +{
> +	struct drm_i915_gem_get_cache_type cache = {0, 0};
> +	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
> +	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
> +	int ret = 0;
> +
> +	cache.handle = bo_gem->gem_handle;
> +	ret = drmIoctl(bufmgr_gem->fd, DRM_IOCTL_I915_GEM_GET_CACHE_TYPE,
> +		       &cache);
> +
> +	/* This should maintain old behavior */
> +	if (ret)
> +		return 0;

Can you return I915_CACHE_NONE instead of this magic 0?

> +	return cache.cache_level;

Can we do

#define CACHE_LEVEL 0xff
	return cache.cache_level & CACHE_LEVEL;

instead so that we retain a few bits for flags (e.g. gfdt or whatever
other crazy stuff future hw might bring)? Alternatively, just add 64 bits
of pad to the ioctl struct.
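
I.e. the padded variant, roughly (a sketch only):

struct drm_i915_gem_get_cache_type {
	__u32 handle;
	__u32 cache_level;
	__u64 pad;	/* reserved, room for future flags (e.g. gfdt) */
};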

> +}
> +
>  static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
>  {
>  	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
> @@ -1065,8 +1089,8 @@ static int drm_intel_gem_bo_map(drm_intel_bo *bo, int write_enable)
>  		    strerror(errno));
>  	}
>  
> -	/* TODO ask kernel if llc cached, in that case don't clear this */
> -	bo_gem->in_gtt_domain = 0;
> +	if (get_cache_type(bo) != I915_CACHE_LLC)
> +		bo_gem->in_gtt_domain = 0;
>  
>  	pthread_mutex_unlock(&bufmgr_gem->lock);
>  
> @@ -1335,14 +1359,14 @@ int drm_intel_gem_bo_map_nonblocking(drm_intel_bo *bo)
>  	drm_intel_bufmgr_gem *bufmgr_gem = (drm_intel_bufmgr_gem *) bo->bufmgr;
>  	drm_intel_bo_gem *bo_gem = (drm_intel_bo_gem *) bo;
>  	struct drm_i915_gem_set_domain set_domain;
> -	int gpu_coherent_cpu_map;
> +	int gpu_coherent_cpu_map = 0;
>  	int ret;
>  
>  	pthread_mutex_lock(&bufmgr_gem->lock);
> -	assert(bo_gem->tiling_mode != I915_TILING_NONE);
> +	assert(bo_gem->tiling_mode == I915_TILING_NONE);

Oops, good catch ;-)

> -	/* TODO ask the kernel about this */
> -	gpu_coherent_cpu_map = 1;
> +	if (bufmgr_gem->gen >= 6)
> +		gpu_coherent_cpu_map = 1;
>  
>  	if (gpu_coherent_cpu_map) {
>  		ret = do_mmap_cpu(bufmgr_gem, bo_gem);
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] drm/i915: ioctl to query a bo's cache level
  2011-09-22  7:35                 ` Daniel Vetter
@ 2011-09-22 15:36                   ` Ben Widawsky
  2011-09-22 15:49                     ` Chris Wilson
  0 siblings, 1 reply; 28+ messages in thread
From: Ben Widawsky @ 2011-09-22 15:36 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, intel-gfx

On Thu, Sep 22, 2011 at 09:35:12AM +0200, Daniel Vetter wrote:
> On Wed, Sep 21, 2011 at 06:47:10PM -0700, Ben Widawsky wrote:
> > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > index a546a71..362da16 100644
> > --- a/drivers/gpu/drm/i915/i915_gem.c
> > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > @@ -3777,6 +3777,29 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
> >  }
> >  
> >  int
> > +i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
> > +			      struct drm_file *file_priv)
> > +{
> > +
> > +	struct drm_i915_gem_get_cache_type *args = data;
> > +	struct drm_i915_gem_object *obj;
> > +	int ret = 0;
> > +
> > +	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
> > +	if (&obj->base == NULL) {
> > +		ret = -ENOENT;
> > +		goto out;
> > +	}
> > +
> > +	args->cache_level = obj->cache_level;
> 
> Grab struct_mutex around this, obj->cache_level might change over the
> lifetime of the bo. Yeah, our locking is a mess, I know ;-)

I don't see how a lock by itself helps. The cache_level could change as soon as
you release the lock which results in the same problem.

Ben

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] drm/i915: ioctl to query a bo's cache level
  2011-09-22 15:36                   ` Ben Widawsky
@ 2011-09-22 15:49                     ` Chris Wilson
  0 siblings, 0 replies; 28+ messages in thread
From: Chris Wilson @ 2011-09-22 15:49 UTC (permalink / raw)
  To: Ben Widawsky, Daniel Vetter; +Cc: Daniel Vetter, intel-gfx

On Thu, 22 Sep 2011 15:36:51 +0000, Ben Widawsky <ben@bwidawsk.net> wrote:
> On Thu, Sep 22, 2011 at 09:35:12AM +0200, Daniel Vetter wrote:
> > On Wed, Sep 21, 2011 at 06:47:10PM -0700, Ben Widawsky wrote:
> > > diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> > > index a546a71..362da16 100644
> > > --- a/drivers/gpu/drm/i915/i915_gem.c
> > > +++ b/drivers/gpu/drm/i915/i915_gem.c
> > > @@ -3777,6 +3777,29 @@ void i915_gem_free_object(struct drm_gem_object *gem_obj)
> > >  }
> > >  
> > >  int
> > > +i915_gem_get_cache_type_ioctl(struct drm_device *dev, void *data,
> > > +			      struct drm_file *file_priv)
> > > +{
> > > +
> > > +	struct drm_i915_gem_get_cache_type *args = data;
> > > +	struct drm_i915_gem_object *obj;
> > > +	int ret = 0;
> > > +
> > > +	obj = to_intel_bo(drm_gem_object_lookup(dev, file_priv, args->handle));
> > > +	if (&obj->base == NULL) {
> > > +		ret = -ENOENT;
> > > +		goto out;
> > > +	}
> > > +
> > > +	args->cache_level = obj->cache_level;
> > 
> > Grab struct_mutex around this, obj->cache_level might change over the
> > lifetime of the bo. Yeah, our locking is a mess, I know ;-)
> 
> I don't see how a lock by itself helps. The cache_level could change as soon as
> you release the lock, which results in the same problem.

Yes, you will always have the issue of coordination between processes,
but you do need the lock around the unref, or use
  drm_gem_object_unreference_unlocked(&obj->base);
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2011-09-22 15:51 UTC | newest]

Thread overview: 28+ messages
2011-09-20  4:25 [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Ben Widawsky
2011-09-20  4:25 ` [PATCH 1/6] drm/i915: object CPU flush interface Ben Widawsky
2011-09-20  4:25 ` [PATCH 2/6] drm/i915: write only object tracking Ben Widawsky
2011-09-20  4:25 ` [PATCH 3/6] drm/i915: Support write only mappings Ben Widawsky
2011-09-20  5:29   ` Keith Packard
2011-09-20  5:37     ` Ben Widawsky
2011-09-20  8:30   ` Chris Wilson
2011-09-20  4:25 ` [PATCH 4/6] intel: write only map support Ben Widawsky
2011-09-20  4:25 ` [PATCH 5/6] write-only mappings Ben Widawsky
2011-09-20  4:25 ` [PATCH 6/6] intel: use write only maps for MapRangeBuffer Ben Widawsky
2011-09-20 11:06 ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Daniel Vetter
2011-09-20 17:17   ` Eric Anholt
2011-09-20 19:19     ` Daniel Vetter
2011-09-21  8:19       ` [PATCH] intel: non-blocking mmaps on the cheap Daniel Vetter
2011-09-21 18:11         ` Eric Anholt
2011-09-21 19:19           ` Daniel Vetter
2011-09-22  1:47             ` [PATCH cont'd] " Ben Widawsky
2011-09-22  1:47               ` [PATCH] drm/i915: ioctl to query a bo's cache level Ben Widawsky
2011-09-22  7:35                 ` Daniel Vetter
2011-09-22 15:36                   ` Ben Widawsky
2011-09-22 15:49                     ` Chris Wilson
2011-09-22  1:47               ` [PATCH] on top of daniel Ben Widawsky
2011-09-22  7:39                 ` Daniel Vetter
2011-09-22  7:33               ` [PATCH cont'd] intel: non-blocking mmaps on the cheap Daniel Vetter
2011-09-20 21:16   ` [PATCH 1/6] RFCish: write only mappings (aka non-blocking) Chris Wilson
2011-09-21  7:02     ` Daniel Vetter
2011-09-20 22:19   ` Ben Widawsky
2011-09-21  7:07     ` Daniel Vetter
