* [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path
@ 2012-03-25 17:47 Daniel Vetter
  2012-03-25 17:47 ` [PATCH 02/16] drm/i915: merge shmem_pread slow&fast-path Daniel Vetter
                   ` (14 more replies)
  0 siblings, 15 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

With the previous rewrite, they've become essentially identical.

v2: Simplify the page_do_bit17_swizzling logic as suggested by Chris
Wilson.

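A minimal sketch of the merged per-page flow this patch creates. The
helper name copy_one_page is hypothetical - the real code open-codes
this inside i915_gem_shmem_pwrite and additionally handles bit17
swizzling, see the diff below:

static int
copy_one_page(struct drm_device *dev, struct page *page, int offset,
	      int len, char __user *user_data, int *hit_slowpath)
{
	char *vaddr;
	int ret;

	/* Fast path: atomic kmap, the copy must not fault while
	 * dev->struct_mutex is held. */
	vaddr = kmap_atomic(page);
	ret = __copy_from_user_inatomic(vaddr + offset, user_data, len);
	kunmap_atomic(vaddr);
	if (ret == 0)
		return 0;

	/* Slow path: drop the lock so faulting on the user pages can't
	 * deadlock against our own pagefault handler, then redo the
	 * copy with the sleeping kmap variant. */
	*hit_slowpath = 1;
	mutex_unlock(&dev->struct_mutex);
	vaddr = kmap(page);
	ret = __copy_from_user(vaddr + offset, user_data, len);
	kunmap(page);
	mutex_lock(&dev->struct_mutex);

	return ret ? -EFAULT : 0;
}
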
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |  126 ++++++++++----------------------------
 1 files changed, 33 insertions(+), 93 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 0a16366..6a636ca 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -711,84 +711,11 @@ out_unpin_pages:
 	return ret;
 }
 
-/**
- * This is the fast shmem pwrite path, which attempts to directly
- * copy_from_user into the kmapped pages backing the object.
- */
-static int
-i915_gem_shmem_pwrite_fast(struct drm_device *dev,
-			   struct drm_i915_gem_object *obj,
-			   struct drm_i915_gem_pwrite *args,
-			   struct drm_file *file)
-{
-	struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
-	ssize_t remain;
-	loff_t offset;
-	char __user *user_data;
-	int page_offset, page_length;
-
-	user_data = (char __user *) (uintptr_t) args->data_ptr;
-	remain = args->size;
-
-	offset = args->offset;
-	obj->dirty = 1;
-
-	while (remain > 0) {
-		struct page *page;
-		char *vaddr;
-		int ret;
-
-		/* Operation in this page
-		 *
-		 * page_offset = offset within page
-		 * page_length = bytes to copy for this page
-		 */
-		page_offset = offset_in_page(offset);
-		page_length = remain;
-		if ((page_offset + remain) > PAGE_SIZE)
-			page_length = PAGE_SIZE - page_offset;
-
-		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-		if (IS_ERR(page))
-			return PTR_ERR(page);
-
-		vaddr = kmap_atomic(page);
-		ret = __copy_from_user_inatomic(vaddr + page_offset,
-						user_data,
-						page_length);
-		kunmap_atomic(vaddr);
-
-		set_page_dirty(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-
-		/* If we get a fault while copying data, then (presumably) our
-		 * source page isn't available.  Return the error and we'll
-		 * retry in the slow path.
-		 */
-		if (ret)
-			return -EFAULT;
-
-		remain -= page_length;
-		user_data += page_length;
-		offset += page_length;
-	}
-
-	return 0;
-}
-
-/**
- * This is the fallback shmem pwrite path, which uses get_user_pages to pin
- * the memory and maps it using kmap_atomic for copying.
- *
- * This avoids taking mmap_sem for faulting on the user's address while the
- * struct_mutex is held.
- */
 static int
-i915_gem_shmem_pwrite_slow(struct drm_device *dev,
-			   struct drm_i915_gem_object *obj,
-			   struct drm_i915_gem_pwrite *args,
-			   struct drm_file *file)
+i915_gem_shmem_pwrite(struct drm_device *dev,
+		      struct drm_i915_gem_object *obj,
+		      struct drm_i915_gem_pwrite *args,
+		      struct drm_file *file)
 {
 	struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
 	ssize_t remain;
@@ -796,6 +723,7 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
 	char __user *user_data;
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
+	int hit_slowpath = 0;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
@@ -805,8 +733,6 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
 	offset = args->offset;
 	obj->dirty = 1;
 
-	mutex_unlock(&dev->struct_mutex);
-
 	while (remain > 0) {
 		struct page *page;
 		char *vaddr;
@@ -831,6 +757,21 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
+		if (!page_do_bit17_swizzling) {
+			vaddr = kmap_atomic(page);
+			ret = __copy_from_user_inatomic(vaddr + shmem_page_offset,
+							user_data,
+							page_length);
+			kunmap_atomic(vaddr);
+
+			if (ret == 0)
+				goto next_page;
+		}
+
+		hit_slowpath = 1;
+
+		mutex_unlock(&dev->struct_mutex);
+
 		vaddr = kmap(page);
 		if (page_do_bit17_swizzling)
 			ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
@@ -842,6 +783,8 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
 					       page_length);
 		kunmap(page);
 
+		mutex_lock(&dev->struct_mutex);
+next_page:
 		set_page_dirty(page);
 		mark_page_accessed(page);
 		page_cache_release(page);
@@ -857,15 +800,16 @@ i915_gem_shmem_pwrite_slow(struct drm_device *dev,
 	}
 
 out:
-	mutex_lock(&dev->struct_mutex);
-	/* Fixup: Kill any reinstated backing storage pages */
-	if (obj->madv == __I915_MADV_PURGED)
-		i915_gem_object_truncate(obj);
-	/* and flush dirty cachelines in case the object isn't in the cpu write
-	 * domain anymore. */
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
-		i915_gem_clflush_object(obj);
-		intel_gtt_chipset_flush();
+	if (hit_slowpath) {
+		/* Fixup: Kill any reinstated backing storage pages */
+		if (obj->madv == __I915_MADV_PURGED)
+			i915_gem_object_truncate(obj);
+		/* and flush dirty cachelines in case the object isn't in the cpu write
+		 * domain anymore. */
+		if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
+			i915_gem_clflush_object(obj);
+			intel_gtt_chipset_flush();
+		}
 	}
 
 	return ret;
@@ -959,11 +903,7 @@ out_unpin:
 	if (ret)
 		goto out;
 
-	ret = -EFAULT;
-	if (!i915_gem_object_needs_bit17_swizzle(obj))
-		ret = i915_gem_shmem_pwrite_fast(dev, obj, args, file);
-	if (ret == -EFAULT)
-		ret = i915_gem_shmem_pwrite_slow(dev, obj, args, file);
+	ret = i915_gem_shmem_pwrite(dev, obj, args, file);
 
 out:
 	drm_gem_object_unreference(&obj->base);
-- 
1.7.7.6

* [PATCH 02/16] drm/i915: merge shmem_pread slow&fast-path
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 03/16] drm: add helper to clflush a virtual address range Daniel Vetter
                   ` (13 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

With the previous rewrite, they've become essentially identical.

v2: Simplify the page_do_bit17_swizzling logic as suggested by Chris
Wilson.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |  108 ++++++++++-----------------------------
 1 files changed, 27 insertions(+), 81 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 6a636ca..681ead9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -259,66 +259,6 @@ static int i915_gem_object_needs_bit17_swizzle(struct drm_i915_gem_object *obj)
 		obj->tiling_mode != I915_TILING_NONE;
 }
 
-/**
- * This is the fast shmem pread path, which attempts to copy_from_user directly
- * from the backing pages of the object to the user's address space.  On a
- * fault, it fails so we can fall back to i915_gem_shmem_pwrite_slow().
- */
-static int
-i915_gem_shmem_pread_fast(struct drm_device *dev,
-			  struct drm_i915_gem_object *obj,
-			  struct drm_i915_gem_pread *args,
-			  struct drm_file *file)
-{
-	struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
-	ssize_t remain;
-	loff_t offset;
-	char __user *user_data;
-	int page_offset, page_length;
-
-	user_data = (char __user *) (uintptr_t) args->data_ptr;
-	remain = args->size;
-
-	offset = args->offset;
-
-	while (remain > 0) {
-		struct page *page;
-		char *vaddr;
-		int ret;
-
-		/* Operation in this page
-		 *
-		 * page_offset = offset within page
-		 * page_length = bytes to copy for this page
-		 */
-		page_offset = offset_in_page(offset);
-		page_length = remain;
-		if ((page_offset + remain) > PAGE_SIZE)
-			page_length = PAGE_SIZE - page_offset;
-
-		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-		if (IS_ERR(page))
-			return PTR_ERR(page);
-
-		vaddr = kmap_atomic(page);
-		ret = __copy_to_user_inatomic(user_data,
-					      vaddr + page_offset,
-					      page_length);
-		kunmap_atomic(vaddr);
-
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (ret)
-			return -EFAULT;
-
-		remain -= page_length;
-		user_data += page_length;
-		offset += page_length;
-	}
-
-	return 0;
-}
-
 static inline int
 __copy_to_user_swizzled(char __user *cpu_vaddr,
 			const char *gpu_vaddr, int gpu_offset,
@@ -371,17 +311,11 @@ __copy_from_user_swizzled(char __user *gpu_vaddr, int gpu_offset,
 	return 0;
 }
 
-/**
- * This is the fallback shmem pread path, which allocates temporary storage
- * in kernel space to copy_to_user into outside of the struct_mutex, so we
- * can copy out of the object's backing pages while holding the struct mutex
- * and not take page faults.
- */
 static int
-i915_gem_shmem_pread_slow(struct drm_device *dev,
-			  struct drm_i915_gem_object *obj,
-			  struct drm_i915_gem_pread *args,
-			  struct drm_file *file)
+i915_gem_shmem_pread(struct drm_device *dev,
+		     struct drm_i915_gem_object *obj,
+		     struct drm_i915_gem_pread *args,
+		     struct drm_file *file)
 {
 	struct address_space *mapping = obj->base.filp->f_path.dentry->d_inode->i_mapping;
 	char __user *user_data;
@@ -389,6 +323,7 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
 	loff_t offset;
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
+	int hit_slowpath = 0;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
@@ -397,8 +332,6 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
 
 	offset = args->offset;
 
-	mutex_unlock(&dev->struct_mutex);
-
 	while (remain > 0) {
 		struct page *page;
 		char *vaddr;
@@ -422,6 +355,20 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
+		if (!page_do_bit17_swizzling) {
+			vaddr = kmap_atomic(page);
+			ret = __copy_to_user_inatomic(user_data,
+						      vaddr + shmem_page_offset,
+						      page_length);
+			kunmap_atomic(vaddr);
+			if (ret == 0) 
+				goto next_page;
+		}
+
+		hit_slowpath = 1;
+
+		mutex_unlock(&dev->struct_mutex);
+
 		vaddr = kmap(page);
 		if (page_do_bit17_swizzling)
 			ret = __copy_to_user_swizzled(user_data,
@@ -433,6 +380,8 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
 					     page_length);
 		kunmap(page);
 
+		mutex_lock(&dev->struct_mutex);
+next_page:
 		mark_page_accessed(page);
 		page_cache_release(page);
 
@@ -447,10 +396,11 @@ i915_gem_shmem_pread_slow(struct drm_device *dev,
 	}
 
 out:
-	mutex_lock(&dev->struct_mutex);
-	/* Fixup: Kill any reinstated backing storage pages */
-	if (obj->madv == __I915_MADV_PURGED)
-		i915_gem_object_truncate(obj);
+	if (hit_slowpath) {
+		/* Fixup: Kill any reinstated backing storage pages */
+		if (obj->madv == __I915_MADV_PURGED)
+			i915_gem_object_truncate(obj);
+	}
 
 	return ret;
 }
@@ -506,11 +456,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 	if (ret)
 		goto out;
 
-	ret = -EFAULT;
-	if (!i915_gem_object_needs_bit17_swizzle(obj))
-		ret = i915_gem_shmem_pread_fast(dev, obj, args, file);
-	if (ret == -EFAULT)
-		ret = i915_gem_shmem_pread_slow(dev, obj, args, file);
+	ret = i915_gem_shmem_pread(dev, obj, args, file);
 
 out:
 	drm_gem_object_unreference(&obj->base);
-- 
1.7.7.6

* [PATCH 03/16] drm: add helper to clflush a virtual address range
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
  2012-03-25 17:47 ` [PATCH 02/16] drm/i915: merge shmem_pread slow&fast-path Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 04/16] drm/i915: move clflushing into shmem_pread Daniel Vetter
                   ` (12 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter, dri-devel

Useful for copying data in/out when the page is already mapped.

For -stable because the next patch (fixing phys obj pwrite) needs this
little helper function.

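A hedged usage sketch, mirroring how the pread/pwrite paths later in
this series use the helper (page, offset, len and user_data are
assumed from a surrounding copy loop): flush just the byte range about
to be copied out of an already-kmapped page, instead of clflushing
whole pages with drm_clflush_pages().

	char *vaddr = kmap(page);

	/* Flush only the cachelines covering [offset, offset + len). */
	drm_clflush_virt_range(vaddr + offset, len);
	ret = __copy_to_user(user_data, vaddr + offset, len);
	kunmap(page);
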
Cc: dri-devel@lists.freedesktop.org
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/drm_cache.c |   23 +++++++++++++++++++++++
 include/drm/drmP.h          |    1 +
 2 files changed, 24 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/drm_cache.c b/drivers/gpu/drm/drm_cache.c
index 5928653..c7c8f6b 100644
--- a/drivers/gpu/drm/drm_cache.c
+++ b/drivers/gpu/drm/drm_cache.c
@@ -98,3 +98,26 @@ drm_clflush_pages(struct page *pages[], unsigned long num_pages)
 #endif
 }
 EXPORT_SYMBOL(drm_clflush_pages);
+
+void
+drm_clflush_virt_range(char *addr, unsigned long length)
+{
+#if defined(CONFIG_X86)
+	if (cpu_has_clflush) {
+		char *end = addr + length;
+		mb();
+		for (; addr < end; addr += boot_cpu_data.x86_clflush_size)
+			clflush(addr);
+		clflush(end - 1);
+		mb();
+		return;
+	}
+
+	if (on_each_cpu(drm_clflush_ipi_handler, NULL, 1) != 0)
+		printk(KERN_ERR "Timed out waiting for cache flush.\n");
+#else
+	printk(KERN_ERR "Architecture has no drm_cache.c support\n");
+	WARN_ON_ONCE(1);
+#endif
+}
+EXPORT_SYMBOL(drm_clflush_virt_range);
diff --git a/include/drm/drmP.h b/include/drm/drmP.h
index 92f0981..d33597b 100644
--- a/include/drm/drmP.h
+++ b/include/drm/drmP.h
@@ -1332,6 +1332,7 @@ extern int drm_remove_magic(struct drm_master *master, drm_magic_t magic);
 
 /* Cache management (drm_cache.c) */
 void drm_clflush_pages(struct page *pages[], unsigned long num_pages);
+void drm_clflush_virt_range(char *addr, unsigned long length);
 
 				/* Locking IOCTL support (drm_lock.h) */
 extern int drm_lock(struct drm_device *dev, void *data,
-- 
1.7.7.6

* [PATCH 04/16] drm/i915: move clflushing into shmem_pread
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
  2012-03-25 17:47 ` [PATCH 02/16] drm/i915: merge shmem_pread slow&fast-path Daniel Vetter
  2012-03-25 17:47 ` [PATCH 03/16] drm: add helper to clflush a virtual address range Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 05/16] drm/i915: kill ranged cpu read domain support Daniel Vetter
                   ` (11 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

This is obviously going to slow down pread. But for a half-way
realistic micro-benchmark it doesn't matter: non-broken userspace reads
back data from the gpu once before the gpu dirties it again.

So all this ranged clflush tracking is just a waste of time.

No pread performance change measured (neglecting the dumb benchmark of
constantly reading the same data).

As an added bonus, this avoids clflush on read on coherent objects,
which means that partial preads on snb are now roughly 4x as fast.
This will be useful for e.g. the libva encoder, once I finally get
around to fixing that up.

v2: Properly sync with the gpu on LLC machines.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |   26 ++++++++++++++++++++------
 1 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 681ead9..a5545c9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -324,12 +324,25 @@ i915_gem_shmem_pread(struct drm_device *dev,
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
+	int needs_clflush = 0;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
+	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)) {
+		/* If we're not in the cpu read domain, set ourself into the gtt
+		 * read domain and manually flush cachelines (if required). This
+		 * optimizes for the case when the gpu will dirty the data
+		 * anyway again before the next pread happens. */
+		if (obj->cache_level == I915_CACHE_NONE)
+			needs_clflush = 1;
+		ret = i915_gem_object_set_to_gtt_domain(obj, false);
+		if (ret)
+			return ret;
+	}
+
 	offset = args->offset;
 
 	while (remain > 0) {
@@ -357,6 +370,9 @@ i915_gem_shmem_pread(struct drm_device *dev,
 
 		if (!page_do_bit17_swizzling) {
 			vaddr = kmap_atomic(page);
+			if (needs_clflush)
+				drm_clflush_virt_range(vaddr + shmem_page_offset,
+						       page_length);
 			ret = __copy_to_user_inatomic(user_data,
 						      vaddr + shmem_page_offset,
 						      page_length);
@@ -370,6 +386,10 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 
 		vaddr = kmap(page);
+		if (needs_clflush)
+			drm_clflush_virt_range(vaddr + shmem_page_offset,
+					       page_length);
+
 		if (page_do_bit17_swizzling)
 			ret = __copy_to_user_swizzled(user_data,
 						      vaddr, shmem_page_offset,
@@ -450,12 +470,6 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 
 	trace_i915_gem_object_pread(obj, args->offset, args->size);
 
-	ret = i915_gem_object_set_cpu_read_domain_range(obj,
-							args->offset,
-							args->size);
-	if (ret)
-		goto out;
-
 	ret = i915_gem_shmem_pread(dev, obj, args, file);
 
 out:
-- 
1.7.7.6

* [PATCH 05/16] drm/i915: kill ranged cpu read domain support
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (2 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 04/16] drm/i915: move clflushing into shmem_pread Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 06/16] drm/i915: don't use gtt_pwrite on LLC cached objects Daniel Vetter
                   ` (10 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

No longer needed.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_drv.h |    7 --
 drivers/gpu/drm/i915/i915_gem.c |  117 ---------------------------------------
 2 files changed, 0 insertions(+), 124 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 9c75a7b..313e854 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -927,13 +927,6 @@ struct drm_i915_gem_object {
 	/** Record of address bit 17 of each page at last unbind. */
 	unsigned long *bit_17;
 
-
-	/**
-	 * If present, while GEM_DOMAIN_CPU is in the read domain this array
-	 * flags which individual pages are valid.
-	 */
-	uint8_t *page_cpu_valid;
-
 	/** User space pin count and filp owning the pin */
 	uint32_t user_pin_count;
 	struct drm_file *pin_filp;
diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index a5545c9..2c6c1dc 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -41,10 +41,6 @@ static void i915_gem_object_flush_gtt_write_domain(struct drm_i915_gem_object *o
 static void i915_gem_object_flush_cpu_write_domain(struct drm_i915_gem_object *obj);
 static __must_check int i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj,
 							  bool write);
-static __must_check int i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
-								  uint64_t offset,
-								  uint64_t size);
-static void i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj);
 static __must_check int i915_gem_object_bind_to_gtt(struct drm_i915_gem_object *obj,
 						    unsigned alignment,
 						    bool map_and_fenceable);
@@ -3010,11 +3006,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 
 	i915_gem_object_flush_gtt_write_domain(obj);
 
-	/* If we have a partially-valid cache of the object in the CPU,
-	 * finish invalidating it and free the per-page flags.
-	 */
-	i915_gem_object_set_to_full_cpu_read_domain(obj);
-
 	old_write_domain = obj->base.write_domain;
 	old_read_domains = obj->base.read_domains;
 
@@ -3045,113 +3036,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	return 0;
 }
 
-/**
- * Moves the object from a partially CPU read to a full one.
- *
- * Note that this only resolves i915_gem_object_set_cpu_read_domain_range(),
- * and doesn't handle transitioning from !(read_domains & I915_GEM_DOMAIN_CPU).
- */
-static void
-i915_gem_object_set_to_full_cpu_read_domain(struct drm_i915_gem_object *obj)
-{
-	if (!obj->page_cpu_valid)
-		return;
-
-	/* If we're partially in the CPU read domain, finish moving it in.
-	 */
-	if (obj->base.read_domains & I915_GEM_DOMAIN_CPU) {
-		int i;
-
-		for (i = 0; i <= (obj->base.size - 1) / PAGE_SIZE; i++) {
-			if (obj->page_cpu_valid[i])
-				continue;
-			drm_clflush_pages(obj->pages + i, 1);
-		}
-	}
-
-	/* Free the page_cpu_valid mappings which are now stale, whether
-	 * or not we've got I915_GEM_DOMAIN_CPU.
-	 */
-	kfree(obj->page_cpu_valid);
-	obj->page_cpu_valid = NULL;
-}
-
-/**
- * Set the CPU read domain on a range of the object.
- *
- * The object ends up with I915_GEM_DOMAIN_CPU in its read flags although it's
- * not entirely valid.  The page_cpu_valid member of the object flags which
- * pages have been flushed, and will be respected by
- * i915_gem_object_set_to_cpu_domain() if it's called on to get a valid mapping
- * of the whole object.
- *
- * This function returns when the move is complete, including waiting on
- * flushes to occur.
- */
-static int
-i915_gem_object_set_cpu_read_domain_range(struct drm_i915_gem_object *obj,
-					  uint64_t offset, uint64_t size)
-{
-	uint32_t old_read_domains;
-	int i, ret;
-
-	if (offset == 0 && size == obj->base.size)
-		return i915_gem_object_set_to_cpu_domain(obj, 0);
-
-	ret = i915_gem_object_flush_gpu_write_domain(obj);
-	if (ret)
-		return ret;
-
-	ret = i915_gem_object_wait_rendering(obj);
-	if (ret)
-		return ret;
-
-	i915_gem_object_flush_gtt_write_domain(obj);
-
-	/* If we're already fully in the CPU read domain, we're done. */
-	if (obj->page_cpu_valid == NULL &&
-	    (obj->base.read_domains & I915_GEM_DOMAIN_CPU) != 0)
-		return 0;
-
-	/* Otherwise, create/clear the per-page CPU read domain flag if we're
-	 * newly adding I915_GEM_DOMAIN_CPU
-	 */
-	if (obj->page_cpu_valid == NULL) {
-		obj->page_cpu_valid = kzalloc(obj->base.size / PAGE_SIZE,
-					      GFP_KERNEL);
-		if (obj->page_cpu_valid == NULL)
-			return -ENOMEM;
-	} else if ((obj->base.read_domains & I915_GEM_DOMAIN_CPU) == 0)
-		memset(obj->page_cpu_valid, 0, obj->base.size / PAGE_SIZE);
-
-	/* Flush the cache on any pages that are still invalid from the CPU's
-	 * perspective.
-	 */
-	for (i = offset / PAGE_SIZE; i <= (offset + size - 1) / PAGE_SIZE;
-	     i++) {
-		if (obj->page_cpu_valid[i])
-			continue;
-
-		drm_clflush_pages(obj->pages + i, 1);
-
-		obj->page_cpu_valid[i] = 1;
-	}
-
-	/* It should now be out of any other write domains, and we can update
-	 * the domain values for our changes.
-	 */
-	BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
-
-	old_read_domains = obj->base.read_domains;
-	obj->base.read_domains |= I915_GEM_DOMAIN_CPU;
-
-	trace_i915_gem_object_change_domain(obj,
-					    old_read_domains,
-					    obj->base.write_domain);
-
-	return 0;
-}
-
 /* Throttle our rendering by waiting until the ring has completed our requests
  * emitted over 20 msec ago.
  *
@@ -3576,7 +3460,6 @@ static void i915_gem_free_object_tail(struct drm_i915_gem_object *obj)
 	drm_gem_object_release(&obj->base);
 	i915_gem_info_remove_obj(dev_priv, obj->base.size);
 
-	kfree(obj->page_cpu_valid);
 	kfree(obj->bit_17);
 	kfree(obj);
 }
-- 
1.7.7.6

* [PATCH 06/16] drm/i915: don't use gtt_pwrite on LLC cached objects
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (3 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 05/16] drm/i915: kill ranged cpu read domain support Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily Daniel Vetter
                   ` (9 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

~120 µs instead of ~210 µs to write 1MB on my snb. I like this.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2c6c1dc..c5b250c 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -828,6 +828,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	}
 
 	if (obj->gtt_space &&
+	    obj->cache_level == I915_CACHE_NONE &&
 	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 		ret = i915_gem_object_pin(obj, 0, true);
 		if (ret)
-- 
1.7.7.6

* [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (4 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 06/16] drm/i915: don't use gtt_pwrite on LLC cached objects Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-26  9:10   ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 08/16] drm/i915: drop gtt slowpath Daniel Vetter
                   ` (8 subsequent siblings)
  14 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

This speeds up pwrite and pread from ~120 µs to ~100 µs for
reading/writing 1MB on my snb (if the backing storage pages
are already pinned, of course).

v2: Chris Wilson pointed out a glaring page reference bug - I've
unconditionally dropped the reference. With that fixed (and the
associated reduction of dirt in dmesg) it's now even a notch faster.

v3: Unconditionally grab a page reference when dropping
dev->struct_mutex to simplify the code flow.

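A consolidated sketch of the v3 reference handling, with identifiers
as in the diff below: the slowpath always takes its own reference
before dropping the lock, so the page can't disappear regardless of
whether it came out of obj->pages or from a fresh shmem lookup.

	if (obj->pages) {
		/* Backing storage already pinned, no reference taken. */
		page = obj->pages[offset >> PAGE_SHIFT];
		release_page = 0;
	} else {
		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
		if (IS_ERR(page))
			return PTR_ERR(page);
		release_page = 1;	/* we own a lookup reference */
	}

	/* Slowpath only: extra reference held across the mutex drop. */
	page_cache_get(page);
	mutex_unlock(&dev->struct_mutex);
	/* ... sleeping copy ... */
	mutex_lock(&dev->struct_mutex);
	page_cache_release(page);

	/* Common exit: drop the lookup reference, if any. */
	if (release_page)
		page_cache_release(page);
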
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |   42 +++++++++++++++++++++++++++-----------
 1 files changed, 30 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c5b250c..117fda4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -321,6 +321,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
 	int needs_clflush = 0;
+	int release_page;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
@@ -355,10 +356,16 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		if ((shmem_page_offset + page_length) > PAGE_SIZE)
 			page_length = PAGE_SIZE - shmem_page_offset;
 
-		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-		if (IS_ERR(page)) {
-			ret = PTR_ERR(page);
-			goto out;
+		if (obj->pages) {
+			page = obj->pages[offset >> PAGE_SHIFT];
+			release_page = 0;
+		} else {
+			page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+			if (IS_ERR(page)) {
+				ret = PTR_ERR(page);
+				goto out;
+			}
+			release_page = 1;
 		}
 
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
@@ -378,7 +385,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		}
 
 		hit_slowpath = 1;
-
+		page_cache_get(page);
 		mutex_unlock(&dev->struct_mutex);
 
 		vaddr = kmap(page);
@@ -397,9 +404,11 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		kunmap(page);
 
 		mutex_lock(&dev->struct_mutex);
+		page_cache_release(page);
 next_page:
 		mark_page_accessed(page);
-		page_cache_release(page);
+		if (release_page)
+			page_cache_release(page);
 
 		if (ret) {
 			ret = -EFAULT;
@@ -680,6 +689,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
+	int release_page;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
@@ -704,10 +714,16 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		if ((shmem_page_offset + page_length) > PAGE_SIZE)
 			page_length = PAGE_SIZE - shmem_page_offset;
 
-		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
-		if (IS_ERR(page)) {
-			ret = PTR_ERR(page);
-			goto out;
+		if (obj->pages) {
+			page = obj->pages[offset >> PAGE_SHIFT];
+			release_page = 0;
+		} else {
+			page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
+			if (IS_ERR(page)) {
+				ret = PTR_ERR(page);
+				goto out;
+			}
+			release_page = 1;
 		}
 
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
@@ -725,7 +741,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		}
 
 		hit_slowpath = 1;
-
+		page_cache_get(page);
 		mutex_unlock(&dev->struct_mutex);
 
 		vaddr = kmap(page);
@@ -740,10 +756,12 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		kunmap(page);
 
 		mutex_lock(&dev->struct_mutex);
+		page_cache_release(page);
 next_page:
 		set_page_dirty(page);
 		mark_page_accessed(page);
-		page_cache_release(page);
+		if (release_page)
+			page_cache_release(page);
 
 		if (ret) {
 			ret = -EFAULT;
-- 
1.7.7.6

* [PATCH 08/16] drm/i915: drop gtt slowpath
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (5 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 09/16] drm/i915: don't clobber userspace memory before committing to the pread Daniel Vetter
                   ` (7 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

With the proper prefault, it's extremely unlikely that we fall back
to the gtt slowpath.

So just kill it and use the shmem_pwrite path as fallback.

To further clean up the code, move the preparatory gem calls into the
respective pwrite functions. This way the gtt_fast->shmem fallback
is much more obvious.

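Condensed from the diff below, the dispatch in i915_gem_pwrite_ioctl()
now boils down to this (error paths elided):

	ret = -EFAULT;
	if (obj->gtt_space &&
	    obj->cache_level == I915_CACHE_NONE &&
	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
		/* Does its own pin/set-to-gtt-domain/put-fence now. */
		ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
	}

	/* Covers both "gtt path not attempted" and "gtt path faulted",
	 * e.g. for non-page-backed user pointers. */
	if (ret == -EFAULT)
		ret = i915_gem_shmem_pwrite(dev, obj, args, file);
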
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |  183 +++++++--------------------------------
 1 files changed, 30 insertions(+), 153 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 117fda4..ddd1fa4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -504,30 +504,6 @@ fast_user_write(struct io_mapping *mapping,
 	return unwritten;
 }
 
-/* Here's the write path which can sleep for
- * page faults
- */
-
-static inline void
-slow_kernel_write(struct io_mapping *mapping,
-		  loff_t gtt_base, int gtt_offset,
-		  struct page *user_page, int user_offset,
-		  int length)
-{
-	char __iomem *dst_vaddr;
-	char *src_vaddr;
-
-	dst_vaddr = io_mapping_map_wc(mapping, gtt_base);
-	src_vaddr = kmap(user_page);
-
-	memcpy_toio(dst_vaddr + gtt_offset,
-		    src_vaddr + user_offset,
-		    length);
-
-	kunmap(user_page);
-	io_mapping_unmap(dst_vaddr);
-}
-
 /**
  * This is the fast pwrite path, where we copy the data directly from the
  * user into the GTT, uncached.
@@ -542,7 +518,19 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev,
 	ssize_t remain;
 	loff_t offset, page_base;
 	char __user *user_data;
-	int page_offset, page_length;
+	int page_offset, page_length, ret;
+
+	ret = i915_gem_object_pin(obj, 0, true);
+	if (ret)
+		goto out;
+
+	ret = i915_gem_object_set_to_gtt_domain(obj, true);
+	if (ret)
+		goto out_unpin;
+
+	ret = i915_gem_object_put_fence(obj);
+	if (ret)
+		goto out_unpin;
 
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
@@ -567,112 +555,19 @@ i915_gem_gtt_pwrite_fast(struct drm_device *dev,
 		 * retry in the slow path.
 		 */
 		if (fast_user_write(dev_priv->mm.gtt_mapping, page_base,
-				    page_offset, user_data, page_length))
-			return -EFAULT;
+				    page_offset, user_data, page_length)) {
+			ret = -EFAULT;
+			goto out_unpin;
+		}
 
 		remain -= page_length;
 		user_data += page_length;
 		offset += page_length;
 	}
 
-	return 0;
-}
-
-/**
- * This is the fallback GTT pwrite path, which uses get_user_pages to pin
- * the memory and maps it using kmap_atomic for copying.
- *
- * This code resulted in x11perf -rgb10text consuming about 10% more CPU
- * than using i915_gem_gtt_pwrite_fast on a G45 (32-bit).
- */
-static int
-i915_gem_gtt_pwrite_slow(struct drm_device *dev,
-			 struct drm_i915_gem_object *obj,
-			 struct drm_i915_gem_pwrite *args,
-			 struct drm_file *file)
-{
-	drm_i915_private_t *dev_priv = dev->dev_private;
-	ssize_t remain;
-	loff_t gtt_page_base, offset;
-	loff_t first_data_page, last_data_page, num_pages;
-	loff_t pinned_pages, i;
-	struct page **user_pages;
-	struct mm_struct *mm = current->mm;
-	int gtt_page_offset, data_page_offset, data_page_index, page_length;
-	int ret;
-	uint64_t data_ptr = args->data_ptr;
-
-	remain = args->size;
-
-	/* Pin the user pages containing the data.  We can't fault while
-	 * holding the struct mutex, and all of the pwrite implementations
-	 * want to hold it while dereferencing the user data.
-	 */
-	first_data_page = data_ptr / PAGE_SIZE;
-	last_data_page = (data_ptr + args->size - 1) / PAGE_SIZE;
-	num_pages = last_data_page - first_data_page + 1;
-
-	user_pages = drm_malloc_ab(num_pages, sizeof(struct page *));
-	if (user_pages == NULL)
-		return -ENOMEM;
-
-	mutex_unlock(&dev->struct_mutex);
-	down_read(&mm->mmap_sem);
-	pinned_pages = get_user_pages(current, mm, (uintptr_t)args->data_ptr,
-				      num_pages, 0, 0, user_pages, NULL);
-	up_read(&mm->mmap_sem);
-	mutex_lock(&dev->struct_mutex);
-	if (pinned_pages < num_pages) {
-		ret = -EFAULT;
-		goto out_unpin_pages;
-	}
-
-	ret = i915_gem_object_set_to_gtt_domain(obj, true);
-	if (ret)
-		goto out_unpin_pages;
-
-	ret = i915_gem_object_put_fence(obj);
-	if (ret)
-		goto out_unpin_pages;
-
-	offset = obj->gtt_offset + args->offset;
-
-	while (remain > 0) {
-		/* Operation in this page
-		 *
-		 * gtt_page_base = page offset within aperture
-		 * gtt_page_offset = offset within page in aperture
-		 * data_page_index = page number in get_user_pages return
-		 * data_page_offset = offset with data_page_index page.
-		 * page_length = bytes to copy for this page
-		 */
-		gtt_page_base = offset & PAGE_MASK;
-		gtt_page_offset = offset_in_page(offset);
-		data_page_index = data_ptr / PAGE_SIZE - first_data_page;
-		data_page_offset = offset_in_page(data_ptr);
-
-		page_length = remain;
-		if ((gtt_page_offset + page_length) > PAGE_SIZE)
-			page_length = PAGE_SIZE - gtt_page_offset;
-		if ((data_page_offset + page_length) > PAGE_SIZE)
-			page_length = PAGE_SIZE - data_page_offset;
-
-		slow_kernel_write(dev_priv->mm.gtt_mapping,
-				  gtt_page_base, gtt_page_offset,
-				  user_pages[data_page_index],
-				  data_page_offset,
-				  page_length);
-
-		remain -= page_length;
-		offset += page_length;
-		data_ptr += page_length;
-	}
-
-out_unpin_pages:
-	for (i = 0; i < pinned_pages; i++)
-		page_cache_release(user_pages[i]);
-	drm_free_large(user_pages);
-
+out_unpin:
+	i915_gem_object_unpin(obj);
+out:
 	return ret;
 }
 
@@ -691,6 +586,10 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 	int hit_slowpath = 0;
 	int release_page;
 
+	ret = i915_gem_object_set_to_cpu_domain(obj, 1);
+	if (ret)
+		return ret;
+
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
 
@@ -834,6 +733,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 	trace_i915_gem_object_pwrite(obj, args->offset, args->size);
 
+	ret = -EFAULT;
 	/* We can only do the GTT pwrite on untiled buffers, as otherwise
 	 * it would end up going through the fenced access, and we'll get
 	 * different detiling behavior between reading and writing.
@@ -848,37 +748,14 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 	if (obj->gtt_space &&
 	    obj->cache_level == I915_CACHE_NONE &&
 	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
-		ret = i915_gem_object_pin(obj, 0, true);
-		if (ret)
-			goto out;
-
-		ret = i915_gem_object_set_to_gtt_domain(obj, true);
-		if (ret)
-			goto out_unpin;
-
-		ret = i915_gem_object_put_fence(obj);
-		if (ret)
-			goto out_unpin;
-
 		ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
-		if (ret == -EFAULT)
-			ret = i915_gem_gtt_pwrite_slow(dev, obj, args, file);
-
-out_unpin:
-		i915_gem_object_unpin(obj);
-
-		if (ret != -EFAULT)
-			goto out;
-		/* Fall through to the shmfs paths because the gtt paths might
-		 * fail with non-page-backed user pointers (e.g. gtt mappings
-		 * when moving data between textures). */
+		/* Note that the gtt paths might fail with non-page-backed user
+		 * pointers (e.g. gtt mappings when moving data between
+		 * textures). Fallback to the shmem path in that case. */
 	}
 
-	ret = i915_gem_object_set_to_cpu_domain(obj, 1);
-	if (ret)
-		goto out;
-
-	ret = i915_gem_shmem_pwrite(dev, obj, args, file);
+	if (ret == -EFAULT)
+		ret = i915_gem_shmem_pwrite(dev, obj, args, file);
 
 out:
 	drm_gem_object_unreference(&obj->base);
-- 
1.7.7.6

* [PATCH 09/16] drm/i915: don't clobber userspace memory before committing to the pread
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (6 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 08/16] drm/i915: drop gtt slowpath Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 10/16] drm/i915: implement inline clflush for pwrite Daniel Vetter
                   ` (6 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

The pagemap.h prefault helpers do the prefaulting by simply writing
some data into every page. Hence we should not prefault when we're not
yet committed to actually writing data to userspace. The problem is
now that
- we can't prefault while holding dev->struct_mutex for we could
  deadlock with our own pagefault handler
- we need to grab dev->struct_mutex before copying to sync up with any
  outstanding gpu writes.

Therefore only prefault when we're dropping the lock the first time in
the pread slowpath - at that point we're committed to the write, don't
wait on the gpu anymore and hence won't return early (with e.g.
-EINTR).

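A distilled sketch of the resulting ordering in the pread slowpath,
with identifiers from the diff below; the point is that the prefault
runs only after the first mutex drop, once the ioctl can no longer
back out:

		page_cache_get(page);
		mutex_unlock(&dev->struct_mutex);

		if (!prefaulted) {
			/* Committed now: ignore errors, we've promised
			 * to write the data up to the first fault. */
			fault_in_pages_writeable(user_data, remain);
			prefaulted = 1;
		}

		/* ... sleeping kmap() copy ... */

		mutex_lock(&dev->struct_mutex);
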
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |   16 +++++++++++-----
 1 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ddd1fa4..34593e4 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -320,6 +320,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
+	int prefaulted = 0;
 	int needs_clflush = 0;
 	int release_page;
 
@@ -388,6 +389,16 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		page_cache_get(page);
 		mutex_unlock(&dev->struct_mutex);
 
+		if (!prefaulted) {
+			ret = fault_in_pages_writeable(user_data, remain);
+			/* Userspace is tricking us, but we've already clobbered
+			 * its pages with the prefault and promised to write the
+			 * data up to the first fault. Hence ignore any errors
+			 * and just continue. */
+			(void)ret;
+			prefaulted = 1;
+		}
+
 		vaddr = kmap(page);
 		if (needs_clflush)
 			drm_clflush_virt_range(vaddr + shmem_page_offset,
@@ -451,11 +462,6 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
 		       args->size))
 		return -EFAULT;
 
-	ret = fault_in_pages_writeable((char __user *)(uintptr_t)args->data_ptr,
-				       args->size);
-	if (ret)
-		return -EFAULT;
-
 	ret = i915_mutex_lock_interruptible(dev);
 	if (ret)
 		return ret;
-- 
1.7.7.6

* [PATCH 10/16] drm/i915: implement inline clflush for pwrite
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (7 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 09/16] drm/i915: don't clobber userspace memory before committing to the pread Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 11/16] drm/i915: fall back to shmem pwrite when the buffer is not accessible Daniel Vetter
                   ` (5 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

In micro-benchmarking of the usual pwrite use-pattern of alternating
pwrites with gtt domain reads from the gpu, this yields around 30%
improvement of pwrite throughput across all buffers size. The trick is
that we can avoid clflush cachelines that we will overwrite completely
anyway.

Furthermore, for partial pwrites it gives a proportional speedup on top
of the 30% because we only clflush back the part of the buffer
we're actually writing.

v2: Simplify the clflush-before-write logic, as suggested by Chris
Wilson.

v3: Finishing touches suggested by Chris Wilson:
- add comment to needs_clflush_before and only set this if the bo is
  uncached.
- s/needs_clflush/needs_clflush_after/ in the write paths to clearly
  differentiate it from needs_clflush_before.

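The flush-before test itself is a cheap alignment check. As a worked
example of the bit trick from the diff below (assuming the usual
64-byte x86_clflush_size, i.e. a mask of 63):

	/* A low bit survives the mask iff either the start offset or
	 * the length is not cacheline aligned:
	 *   offset = 128, len = 256: (128 | 256) & 63 == 0 -> full lines
	 *   offset = 130, len = 250: (130 | 250) & 63 != 0 -> partial */
	partial_cacheline_write = needs_clflush_before &&
		((shmem_page_offset | page_length) &
			(boot_cpu_data.x86_clflush_size - 1));
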
Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |   46 +++++++++++++++++++++++++++++++++++---
 1 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 34593e4..2f95bde 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -590,23 +590,39 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 	int shmem_page_offset, page_length, ret = 0;
 	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
 	int hit_slowpath = 0;
+	int needs_clflush_after = 0;
+	int needs_clflush_before = 0;
 	int release_page;
 
-	ret = i915_gem_object_set_to_cpu_domain(obj, 1);
-	if (ret)
-		return ret;
-
 	user_data = (char __user *) (uintptr_t) args->data_ptr;
 	remain = args->size;
 
 	obj_do_bit17_swizzling = i915_gem_object_needs_bit17_swizzle(obj);
 
+	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
+		/* If we're not in the cpu write domain, set ourself into the gtt
+		 * write domain and manually flush cachelines (if required). This
+		 * optimizes for the case when the gpu will use the data
+		 * right away and we therefore have to clflush anyway. */
+		if (obj->cache_level == I915_CACHE_NONE)
+			needs_clflush_after = 1;
+		ret = i915_gem_object_set_to_gtt_domain(obj, true);
+		if (ret)
+			return ret;
+	}
+	/* The same trick applies to invalidating partially written
+	 * cachelines before writing. */
+	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU)
+	    && obj->cache_level == I915_CACHE_NONE)
+		needs_clflush_before = 1;
+
 	offset = args->offset;
 	obj->dirty = 1;
 
 	while (remain > 0) {
 		struct page *page;
 		char *vaddr;
+		int partial_cacheline_write;
 
 		/* Operation in this page
 		 *
@@ -619,6 +635,13 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		if ((shmem_page_offset + page_length) > PAGE_SIZE)
 			page_length = PAGE_SIZE - shmem_page_offset;
 
+		/* If we don't overwrite a cacheline completely we need to be
+		 * careful to have up-to-date data by first clflushing. Don't
+		 * overcomplicate things and flush the entire page. */
+		partial_cacheline_write = needs_clflush_before &&
+			((shmem_page_offset | page_length)
+				& (boot_cpu_data.x86_clflush_size - 1));
+
 		if (obj->pages) {
 			page = obj->pages[offset >> PAGE_SHIFT];
 			release_page = 0;
@@ -636,9 +659,15 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 
 		if (!page_do_bit17_swizzling) {
 			vaddr = kmap_atomic(page);
+			if (partial_cacheline_write)
+				drm_clflush_virt_range(vaddr + shmem_page_offset,
+						       page_length);
 			ret = __copy_from_user_inatomic(vaddr + shmem_page_offset,
 							user_data,
 							page_length);
+			if (needs_clflush_after)
+				drm_clflush_virt_range(vaddr + shmem_page_offset,
+						       page_length);
 			kunmap_atomic(vaddr);
 
 			if (ret == 0)
@@ -650,6 +679,9 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 
 		vaddr = kmap(page);
+		if (partial_cacheline_write)
+			drm_clflush_virt_range(vaddr + shmem_page_offset,
+					       page_length);
 		if (page_do_bit17_swizzling)
 			ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
 							user_data,
@@ -658,6 +690,9 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 			ret = __copy_from_user(vaddr + shmem_page_offset,
 					       user_data,
 					       page_length);
+		if (needs_clflush_after)
+			drm_clflush_virt_range(vaddr + shmem_page_offset,
+					       page_length);
 		kunmap(page);
 
 		mutex_lock(&dev->struct_mutex);
@@ -691,6 +726,9 @@ out:
 		}
 	}
 
+	if (needs_clflush_after)
+		intel_gtt_chipset_flush();
+
 	return ret;
 }
 
-- 
1.7.7.6

* [PATCH 11/16] drm/i915: fall back to shmem pwrite when the buffer is not accessible
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (8 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 10/16] drm/i915: implement inline clflush for pwrite Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 12/16] drm/i915: use uncached writes in pwrite Daniel Vetter
                   ` (4 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

It's too expensive to move it around just for that pwrite, especially
when we're thrashing on the mappable gtt part like crazy.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |    1 +
 1 files changed, 1 insertions(+), 0 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 2f95bde..ce53769 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -791,6 +791,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 
 	if (obj->gtt_space &&
 	    obj->cache_level == I915_CACHE_NONE &&
+	    obj->map_and_fenceable &&
 	    obj->base.write_domain != I915_GEM_DOMAIN_CPU) {
 		ret = i915_gem_gtt_pwrite_fast(dev, obj, args, file);
 		/* Note that the gtt paths might fail with non-page-backed user
-- 
1.7.7.6

* [PATCH 12/16] drm/i915: use uncached writes in pwrite
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (9 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 11/16] drm/i915: fall back to shmem pwrite when the buffer is not accessible Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-25 17:47 ` [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite Daniel Vetter
                   ` (3 subsequent siblings)
  14 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

It's around 20% faster.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |    6 +++---
 1 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index ce53769..d413675 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -662,9 +662,9 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 			if (partial_cacheline_write)
 				drm_clflush_virt_range(vaddr + shmem_page_offset,
 						       page_length);
-			ret = __copy_from_user_inatomic(vaddr + shmem_page_offset,
-							user_data,
-							page_length);
+			ret = __copy_from_user_inatomic_nocache(vaddr + shmem_page_offset,
+								user_data,
+								page_length);
 			if (needs_clflush_after)
 				drm_clflush_virt_range(vaddr + shmem_page_offset,
 						       page_length);
-- 
1.7.7.6

* [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (10 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 12/16] drm/i915: use uncached writes in pwrite Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-26  9:05   ` Chris Wilson
  2012-03-25 17:47 ` [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE Daniel Vetter
                   ` (2 subsequent siblings)
  14 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

While moving things around, these two functions slowly grew out of any
sane bounds. So extract a few lines that do the copying and
clflushing. Also add a few comments to explain what's going on.

v2: Again do s/needs_clflush/needs_clflush_after/ in the write paths
as suggested by Chris Wilson.

Signed-Off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |  196 +++++++++++++++++++++++++++------------
 1 files changed, 136 insertions(+), 60 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d413675..d318a45 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -307,6 +307,60 @@ __copy_from_user_swizzled(char __user *gpu_vaddr, int gpu_offset,
 	return 0;
 }
 
+/* Per-page copy function for the shmem pread fastpath.
+ * Flushes invalid cachelines before reading the target if
+ * needs_clflush is set. */
+static int
+shmem_pread_fast(struct page *page, int shmem_page_offset, int page_length,
+		 char __user *user_data,
+		 bool page_do_bit17_swizzling, bool needs_clflush)
+{
+	char *vaddr;
+	int ret;
+
+	if (page_do_bit17_swizzling)
+		return -EINVAL;
+
+	vaddr = kmap_atomic(page);
+	if (needs_clflush)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+	ret = __copy_to_user_inatomic(user_data,
+				      vaddr + shmem_page_offset,
+				      page_length);
+	kunmap_atomic(vaddr);
+
+	return ret;
+}
+
+/* Only difference to the fast-path function is that this can handle bit17
+ * and uses non-atomic copy and kmap functions. */
+static int
+shmem_pread_slow(struct page *page, int shmem_page_offset, int page_length,
+		 char __user *user_data,
+		 bool page_do_bit17_swizzling, bool needs_clflush)
+{
+	char *vaddr;
+	int ret;
+
+	vaddr = kmap(page);
+	if (needs_clflush)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+
+	if (page_do_bit17_swizzling)
+		ret = __copy_to_user_swizzled(user_data,
+					      vaddr, shmem_page_offset,
+					      page_length);
+	else
+		ret = __copy_to_user(user_data,
+				     vaddr + shmem_page_offset,
+				     page_length);
+	kunmap(page);
+
+	return ret;
+}
+
 static int
 i915_gem_shmem_pread(struct drm_device *dev,
 		     struct drm_i915_gem_object *obj,
@@ -345,7 +399,6 @@ i915_gem_shmem_pread(struct drm_device *dev,
 
 	while (remain > 0) {
 		struct page *page;
-		char *vaddr;
 
 		/* Operation in this page
 		 *
@@ -372,18 +425,11 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
-		if (!page_do_bit17_swizzling) {
-			vaddr = kmap_atomic(page);
-			if (needs_clflush)
-				drm_clflush_virt_range(vaddr + shmem_page_offset,
-						       page_length);
-			ret = __copy_to_user_inatomic(user_data,
-						      vaddr + shmem_page_offset,
-						      page_length);
-			kunmap_atomic(vaddr);
-			if (ret == 0) 
-				goto next_page;
-		}
+		ret = shmem_pread_fast(page, shmem_page_offset, page_length,
+				       user_data, page_do_bit17_swizzling,
+				       needs_clflush);
+		if (ret == 0)
+			goto next_page;
 
 		hit_slowpath = 1;
 		page_cache_get(page);
@@ -399,20 +445,9 @@ i915_gem_shmem_pread(struct drm_device *dev,
 			prefaulted = 1;
 		}
 
-		vaddr = kmap(page);
-		if (needs_clflush)
-			drm_clflush_virt_range(vaddr + shmem_page_offset,
-					       page_length);
-
-		if (page_do_bit17_swizzling)
-			ret = __copy_to_user_swizzled(user_data,
-						      vaddr, shmem_page_offset,
-						      page_length);
-		else
-			ret = __copy_to_user(user_data,
-					     vaddr + shmem_page_offset,
-					     page_length);
-		kunmap(page);
+		ret = shmem_pread_slow(page, shmem_page_offset, page_length,
+				       user_data, page_do_bit17_swizzling,
+				       needs_clflush);
 
 		mutex_lock(&dev->struct_mutex);
 		page_cache_release(page);
@@ -577,6 +612,70 @@ out:
 	return ret;
 }
 
+/* Per-page copy function for the shmem pwrite fastpath.
+ * Flushes invalid cachelines before writing to the target if
+ * needs_clflush_before is set and flushes out any written cachelines after
+ * writing if needs_clflush_after is set. */
+static int
+shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length,
+		  char __user *user_data,
+		  bool page_do_bit17_swizzling,
+		  bool needs_clflush_before,
+		  bool needs_clflush_after)
+{
+	char *vaddr;
+	int ret;
+
+	if (page_do_bit17_swizzling)
+		return -EINVAL;
+
+	vaddr = kmap_atomic(page);
+	if (needs_clflush_before)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+	ret = __copy_from_user_inatomic_nocache(vaddr + shmem_page_offset,
+						user_data,
+						page_length);
+	if (needs_clflush_after)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+	kunmap_atomic(vaddr);
+
+	return ret;
+}
+
+/* Only difference to the fast-path function is that this can handle bit17
+ * and uses non-atomic copy and kmap functions. */
+static int
+shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length,
+		  char __user *user_data,
+		  bool page_do_bit17_swizzling,
+		  bool needs_clflush_before,
+		  bool needs_clflush_after)
+{
+	char *vaddr;
+	int ret;
+
+	vaddr = kmap(page);
+	if (needs_clflush_before)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+	if (page_do_bit17_swizzling)
+		ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
+						user_data,
+						page_length);
+	else
+		ret = __copy_from_user(vaddr + shmem_page_offset,
+				       user_data,
+				       page_length);
+	if (needs_clflush_after)
+		drm_clflush_virt_range(vaddr + shmem_page_offset,
+				       page_length);
+	kunmap(page);
+
+	return ret;
+}
+
 static int
 i915_gem_shmem_pwrite(struct drm_device *dev,
 		      struct drm_i915_gem_object *obj,
@@ -621,7 +720,6 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 
 	while (remain > 0) {
 		struct page *page;
-		char *vaddr;
 		int partial_cacheline_write;
 
 		/* Operation in this page
@@ -657,43 +755,21 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
 		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
 			(page_to_phys(page) & (1 << 17)) != 0;
 
-		if (!page_do_bit17_swizzling) {
-			vaddr = kmap_atomic(page);
-			if (partial_cacheline_write)
-				drm_clflush_virt_range(vaddr + shmem_page_offset,
-						       page_length);
-			ret = __copy_from_user_inatomic_nocache(vaddr + shmem_page_offset,
-								user_data,
-								page_length);
-			if (needs_clflush_after)
-				drm_clflush_virt_range(vaddr + shmem_page_offset,
-						       page_length);
-			kunmap_atomic(vaddr);
-
-			if (ret == 0)
-				goto next_page;
-		}
+		ret = shmem_pwrite_fast(page, shmem_page_offset, page_length,
+					user_data, page_do_bit17_swizzling,
+					partial_cacheline_write,
+					needs_clflush_after);
+		if (ret == 0)
+			goto next_page;
 
 		hit_slowpath = 1;
 		page_cache_get(page);
 		mutex_unlock(&dev->struct_mutex);
 
-		vaddr = kmap(page);
-		if (partial_cacheline_write)
-			drm_clflush_virt_range(vaddr + shmem_page_offset,
-					       page_length);
-		if (page_do_bit17_swizzling)
-			ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
-							user_data,
-							page_length);
-		else
-			ret = __copy_from_user(vaddr + shmem_page_offset,
-					       user_data,
-					       page_length);
-		if (needs_clflush_after)
-			drm_clflush_virt_range(vaddr + shmem_page_offset,
-					       page_length);
-		kunmap(page);
+		ret = shmem_pwrite_slow(page, shmem_page_offset, page_length,
+					user_data, page_do_bit17_swizzling,
+					partial_cacheline_write,
+					needs_clflush_after);
 
 		mutex_lock(&dev->struct_mutex);
 		page_cache_release(page);
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (11 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-26  9:09   ` Chris Wilson
  2012-03-25 17:47 ` [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos Daniel Vetter
  2012-03-25 17:47 ` [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely Daniel Vetter
  14 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

drm/i915 wants to read/write more than one page in its fastpath
and hence needs to prefault more than PAGE_SIZE bytes.

Add new functions in pagemap.h to make that possible.

Also kill a copy&pasted spurious space in both functions while at it.

v2: As suggested by Andrew Morton, add a multipage parameter to both
functions to avoid the additional branch for the filemap.c hotpath.
My gcc 4.6 here seems to dtrt and indeed reaps these branches where
not needed.

v3: Because I couldn't find a way around adding a uaddr += PAGE_SIZE to
the filemap.c hotpaths (that the compiler couldn't remove again),
let's go with separate new functions for the multipage use-case.

v4: Adjust comment to CodingStyle.
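
As an aside, a sketch of the intended calling pattern (illustration
only - args and the -EFAULT handling are the usual pwrite-ioctl
boilerplate, the real call sites are in the diff below):

	char __user *user_data = (char __user *)(uintptr_t)args->data_ptr;

	/* Touch one byte per page plus the last byte, so that the later
	 * atomic copy of up to args->size bytes is unlikely to fault. */
	if (fault_in_multipages_readable(user_data, args->size))
		return -EFAULT;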

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c            |    6 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |    2 +-
 include/linux/pagemap.h                    |   64 +++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index d318a45..accb2cb 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -436,7 +436,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
 		mutex_unlock(&dev->struct_mutex);
 
 		if (!prefaulted) {
-			ret = fault_in_pages_writeable(user_data, remain);
+			ret = fault_in_multipages_writeable(user_data, remain);
 			/* Userspace is tricking us, but we've already clobbered
 			 * its pages with the prefault and promised to write the
 			 * data up to the first fault. Hence ignore any errors
@@ -829,8 +829,8 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
 		       args->size))
 		return -EFAULT;
 
-	ret = fault_in_pages_readable((char __user *)(uintptr_t)args->data_ptr,
-				      args->size);
+	ret = fault_in_multipages_readable((char __user *)(uintptr_t)args->data_ptr,
+					   args->size);
 	if (ret)
 		return -EFAULT;
 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 0e051ec..0505a62 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -967,7 +967,7 @@ validate_exec_list(struct drm_i915_gem_exec_object2 *exec,
 		if (!access_ok(VERIFY_WRITE, ptr, length))
 			return -EFAULT;
 
-		if (fault_in_pages_readable(ptr, length))
+		if (fault_in_multipages_readable(ptr, length))
 			return -EFAULT;
 	}
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cfaaa69..e7102a3 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -426,7 +426,7 @@ static inline int fault_in_pages_writeable(char __user *uaddr, int size)
 		 */
 		if (((unsigned long)uaddr & PAGE_MASK) !=
 				((unsigned long)end & PAGE_MASK))
-		 	ret = __put_user(0, end);
+			ret = __put_user(0, end);
 	}
 	return ret;
 }
@@ -445,13 +445,73 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size)
 
 		if (((unsigned long)uaddr & PAGE_MASK) !=
 				((unsigned long)end & PAGE_MASK)) {
-		 	ret = __get_user(c, end);
+			ret = __get_user(c, end);
 			(void)c;
 		}
 	}
 	return ret;
 }
 
+/*
+ * Multipage variants of the above prefault helpers, useful if more than
+ * PAGE_SIZE of date needs to be prefaulted. These are separate from the above
+ * functions (which only handle up to PAGE_SIZE) to avoid clobbering the
+ * filemap.c hotpaths.
+ */
+static inline int fault_in_multipages_writeable(char __user *uaddr, int size)
+{
+	int ret;
+	const char __user *end = uaddr + size - 1;
+
+	if (unlikely(size == 0))
+		return 0;
+
+	/*
+	 * Writing zeroes into userspace here is OK, because we know that if
+	 * the zero gets there, we'll be overwriting it.
+	 */
+	while (uaddr <= end) {
+		ret = __put_user(0, uaddr);
+		if (ret != 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	/* Check whether the range spilled into the next page. */
+	if (((unsigned long)uaddr & PAGE_MASK) ==
+			((unsigned long)end & PAGE_MASK))
+		ret = __put_user(0, end);
+
+	return ret;
+}
+
+static inline int fault_in_multipages_readable(const char __user *uaddr,
+					       int size)
+{
+	volatile char c;
+	int ret;
+	const char __user *end = uaddr + size - 1;
+
+	if (unlikely(size == 0))
+		return 0;
+
+	while (uaddr <= end) {
+		ret = __get_user(c, uaddr);
+		if (ret != 0)
+			return ret;
+		uaddr += PAGE_SIZE;
+	}
+
+	/* Check whether the range spilled into the next page. */
+	if (((unsigned long)uaddr & PAGE_MASK) ==
+			((unsigned long)end & PAGE_MASK)) {
+		ret = __get_user(c, end);
+		(void)c;
+	}
+
+	return ret;
+}
+
 int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 				pgoff_t index, gfp_t gfp_mask);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (12 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-26  9:18   ` Chris Wilson
  2012-03-25 17:47 ` [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely Daniel Vetter
  14 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

The issue is that with inline clflushing the clflushing isn't properly
swizzled. Fix this by
- always clflushing entire 128 byte chunks and
- unconditionally flush before writes when swizzling a given page.
  We could be clever and check whether we pwrite a partial 128 byte
  chunk instead of a partial cacheline, but I've figured that's not
  worth it.
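
(For reference, simplified: the swizzled copy helpers already in
i915_gem.c swap the two 64 byte halves of every 128 byte chunk by
flipping bit 6 of the offset, roughly

	/* bit17 swizzle: access the other 64 byte half of the chunk */
	swizzled_offset = offset ^ 64;

so a clflush covering only the linear range can miss the cachelines
the copy actually touches - hence flushing whole 128 byte chunks.)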

Now the usual approach is to fold this into the original patch series, but
I've opted against this because
- this fixes a corner case only very old userspace relies on and
- I'd like to not invalidate all the testing the pwrite rewrite has gotten.

This fixes the regression noticed by tests/gem_tiled_partial_pwrite_pread
from i-g-t. Unfortunately it doesn't fix the issues with partial pwrites to
tiled buffers on bit17 swizzling machines. But that is also broken without
the pwrite patches, so likely a different issue (or a problem with the
testcase).

v2: Simplify the patch by dropping the overly clever partial write
logic for swizzled pages.

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |   39 ++++++++++++++++++++++++++++++++-------
 1 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index accb2cb..cbd3bad 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -333,6 +333,28 @@ shmem_pread_fast(struct page *page, int shmem_page_offset, int page_length,
 	return ret;
 }
 
+static void
+shmem_clflush_swizzled_range(char *addr, unsigned long length,
+			     bool swizzled)
+{
+	if (swizzled) {
+		unsigned long start = (unsigned long) addr;
+		unsigned long end = (unsigned long) addr + length;
+
+		/* For swizzling simply ensure that we always flush both
+		 * channels. Lame, but simple and it works. Swizzled
+		 * pwrite/pread is far from a hotpath - current userspace
+		 * doesn't use it at all. */
+		start = round_down(start, 128);
+		end = round_up(end, 128);
+
+		drm_clflush_virt_range((void *)start, end - start);
+	} else {
+		drm_clflush_virt_range(addr, length);
+	}
+
+}
+
 /* Only difference from the fast-path function is that this can handle bit17
  * swizzling and uses non-atomic copy and kmap functions. */
 static int
@@ -345,8 +367,9 @@ shmem_pread_slow(struct page *page, int shmem_page_offset, int page_length,
 
 	vaddr = kmap(page);
 	if (needs_clflush)
-		drm_clflush_virt_range(vaddr + shmem_page_offset,
-				       page_length);
+		shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+					     page_length,
+					     page_do_bit17_swizzling);
 
 	if (page_do_bit17_swizzling)
 		ret = __copy_to_user_swizzled(user_data,
@@ -657,9 +680,10 @@ shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length,
 	int ret;
 
 	vaddr = kmap(page);
-	if (needs_clflush_before)
-		drm_clflush_virt_range(vaddr + shmem_page_offset,
-				       page_length);
+	if (needs_clflush_before || page_do_bit17_swizzling)
+		shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+					     page_length,
+					     page_do_bit17_swizzling);
 	if (page_do_bit17_swizzling)
 		ret = __copy_from_user_swizzled(vaddr, shmem_page_offset,
 						user_data,
@@ -669,8 +693,9 @@ shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length,
 				       user_data,
 				       page_length);
 	if (needs_clflush_after)
-		drm_clflush_virt_range(vaddr + shmem_page_offset,
-				       page_length);
+		shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
+					     page_length,
+					     page_do_bit17_swizzling);
 	kunmap(page);
 
 	return ret;
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely
  2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
                   ` (13 preceding siblings ...)
  2012-03-25 17:47 ` [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos Daniel Vetter
@ 2012-03-25 17:47 ` Daniel Vetter
  2012-03-26  9:09   ` Chris Wilson
  14 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-25 17:47 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

Besides helping the compiler untangle this maze, they double up as
documentation for which parts aren't performance-critical but are just
around to keep old (but already dead-slow) userspace from breaking.
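
(unlikely() is just the standard hint from include/linux/compiler.h,

	#define unlikely(x) __builtin_expect(!!(x), 0)

so mis-marking a branch can only ever cost performance, not
correctness.)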

Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
---
 drivers/gpu/drm/i915/i915_gem.c |    8 ++++----
 1 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index cbd3bad..755b5f9 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -318,7 +318,7 @@ shmem_pread_fast(struct page *page, int shmem_page_offset, int page_length,
 	char *vaddr;
 	int ret;
 
-	if (page_do_bit17_swizzling)
+	if (unlikely(page_do_bit17_swizzling))
 		return -EINVAL;
 
 	vaddr = kmap_atomic(page);
@@ -337,7 +337,7 @@ static void
 shmem_clflush_swizzled_range(char *addr, unsigned long length,
 			     bool swizzled)
 {
-	if (swizzled) {
+	if (unlikely(swizzled)) {
 		unsigned long start = (unsigned long) addr;
 		unsigned long end = (unsigned long) addr + length;
 
@@ -649,7 +649,7 @@ shmem_pwrite_fast(struct page *page, int shmem_page_offset, int page_length,
 	char *vaddr;
 	int ret;
 
-	if (page_do_bit17_swizzling)
+	if (unlikely(page_do_bit17_swizzling))
 		return -EINVAL;
 
 	vaddr = kmap_atomic(page);
@@ -680,7 +680,7 @@ shmem_pwrite_slow(struct page *page, int shmem_page_offset, int page_length,
 	int ret;
 
 	vaddr = kmap(page);
-	if (needs_clflush_before || page_do_bit17_swizzling)
+	if (unlikely(needs_clflush_before || page_do_bit17_swizzling))
 		shmem_clflush_swizzled_range(vaddr + shmem_page_offset,
 					     page_length,
 					     page_do_bit17_swizzling);
-- 
1.7.7.6

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite
  2012-03-25 17:47 ` [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite Daniel Vetter
@ 2012-03-26  9:05   ` Chris Wilson
  0 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2012-03-26  9:05 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Sun, 25 Mar 2012 19:47:40 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> While moving things around, these two functions slowly grew out of any
> sane bounds. So extract a few lines that do the copying and
> clflushing. Also add a few comments to explain what's going on.
> 
> v2: Again do s/needs_clflush/needs_clflush_after/ in the write paths
> as suggested by Chris Wilson.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>

I've tested up to this point and have no further comments to make. (Ok,
there is the s/claring/glaring/ spelling mistake ;-)

1-13:
  Tested-by: Chris Wilson <chris@chris-wilson.co.uk>
  Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

And it turns out I've never used swizzled pwrites. ;)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE
  2012-03-25 17:47 ` [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE Daniel Vetter
@ 2012-03-26  9:09   ` Chris Wilson
  0 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2012-03-26  9:09 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Sun, 25 Mar 2012 19:47:41 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> +/*
> + * Multipage variants of the above prefault helpers, useful if more than
> + * PAGE_SIZE of date needs to be prefaulted. These are separate from the above
s/date/data/

> + * functions (which only handle up to PAGE_SIZE) to avoid clobbering the
> + * filemap.c hotpaths.
> + */

My only critique is that I'd still like the (a & PAGE_MASK) == (b &
PAGE_MASK) to be replaced with a readable macro.

#define same_page(a, b) (((unsigned long)(a) & PAGE_MASK) == ((unsigned long)(b) & PAGE_MASK))
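
As a sketch, the spill checks at the end would then read

	/* fault the tail byte if the range spills into the next page */
	if (!same_page(uaddr, end))
		ret = __put_user(0, end);

in the single-page helpers, and same_page(uaddr, end) in the multipage
variants, where uaddr has already been advanced past end by the loop.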
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely
  2012-03-25 17:47 ` [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely Daniel Vetter
@ 2012-03-26  9:09   ` Chris Wilson
  2012-03-27 11:42     ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2012-03-26  9:09 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Sun, 25 Mar 2012 19:47:43 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> Besides helping the compiler untangle this maze, they double up as
> documentation for which parts aren't performance-critical but are just
> around to keep old (but already dead-slow) userspace from breaking.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily
  2012-03-25 17:47 ` [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily Daniel Vetter
@ 2012-03-26  9:10   ` Daniel Vetter
  0 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-26  9:10 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Sun, Mar 25, 2012 at 07:47:34PM +0200, Daniel Vetter wrote:
> This speeds up pwrite and pread from ~120 µs to ~100 µs for
> reading/writing 1mb on my snb (if the backing storage pages
> are already pinned, of course).
> 
> v2: Chris Wilson pointed out a claring page reference bug - I've
s/claring/glaring, noticed by Chris.

Note to self: Fix this when applying.
-Daniel

> unconditionally dropped the reference. With that fixed (and the
> associated reduction of dirt in dmesg) it's now even a notch faster.
> 
> v3: Unconditionally grab a page reference when dropping
> dev->struct_mutex to simplify the code-flow.
> 
> Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> ---
>  drivers/gpu/drm/i915/i915_gem.c |   42 +++++++++++++++++++++++++++-----------
>  1 files changed, 30 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index c5b250c..117fda4 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -321,6 +321,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
>  	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
>  	int hit_slowpath = 0;
>  	int needs_clflush = 0;
> +	int release_page;
>  
>  	user_data = (char __user *) (uintptr_t) args->data_ptr;
>  	remain = args->size;
> @@ -355,10 +356,16 @@ i915_gem_shmem_pread(struct drm_device *dev,
>  		if ((shmem_page_offset + page_length) > PAGE_SIZE)
>  			page_length = PAGE_SIZE - shmem_page_offset;
>  
> -		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
> -		if (IS_ERR(page)) {
> -			ret = PTR_ERR(page);
> -			goto out;
> +		if (obj->pages) {
> +			page = obj->pages[offset >> PAGE_SHIFT];
> +			release_page = 0;
> +		} else {
> +			page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
> +			if (IS_ERR(page)) {
> +				ret = PTR_ERR(page);
> +				goto out;
> +			}
> +			release_page = 1;
>  		}
>  
>  		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
> @@ -378,7 +385,7 @@ i915_gem_shmem_pread(struct drm_device *dev,
>  		}
>  
>  		hit_slowpath = 1;
> -
> +		page_cache_get(page);
>  		mutex_unlock(&dev->struct_mutex);
>  
>  		vaddr = kmap(page);
> @@ -397,9 +404,11 @@ i915_gem_shmem_pread(struct drm_device *dev,
>  		kunmap(page);
>  
>  		mutex_lock(&dev->struct_mutex);
> +		page_cache_release(page);
>  next_page:
>  		mark_page_accessed(page);
> -		page_cache_release(page);
> +		if (release_page)
> +			page_cache_release(page);
>  
>  		if (ret) {
>  			ret = -EFAULT;
> @@ -680,6 +689,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  	int shmem_page_offset, page_length, ret = 0;
>  	int obj_do_bit17_swizzling, page_do_bit17_swizzling;
>  	int hit_slowpath = 0;
> +	int release_page;
>  
>  	user_data = (char __user *) (uintptr_t) args->data_ptr;
>  	remain = args->size;
> @@ -704,10 +714,16 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		if ((shmem_page_offset + page_length) > PAGE_SIZE)
>  			page_length = PAGE_SIZE - shmem_page_offset;
>  
> -		page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
> -		if (IS_ERR(page)) {
> -			ret = PTR_ERR(page);
> -			goto out;
> +		if (obj->pages) {
> +			page = obj->pages[offset >> PAGE_SHIFT];
> +			release_page = 0;
> +		} else {
> +			page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT);
> +			if (IS_ERR(page)) {
> +				ret = PTR_ERR(page);
> +				goto out;
> +			}
> +			release_page = 1;
>  		}
>  
>  		page_do_bit17_swizzling = obj_do_bit17_swizzling &&
> @@ -725,7 +741,7 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		}
>  
>  		hit_slowpath = 1;
> -
> +		page_cache_get(page);
>  		mutex_unlock(&dev->struct_mutex);
>  
>  		vaddr = kmap(page);
> @@ -740,10 +756,12 @@ i915_gem_shmem_pwrite(struct drm_device *dev,
>  		kunmap(page);
>  
>  		mutex_lock(&dev->struct_mutex);
> +		page_cache_release(page);
>  next_page:
>  		set_page_dirty(page);
>  		mark_page_accessed(page);
> -		page_cache_release(page);
> +		if (release_page)
> +			page_cache_release(page);
>  
>  		if (ret) {
>  			ret = -EFAULT;
> -- 
> 1.7.7.6
> 

-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos
  2012-03-25 17:47 ` [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos Daniel Vetter
@ 2012-03-26  9:18   ` Chris Wilson
  2012-03-26  9:26     ` Daniel Vetter
  0 siblings, 1 reply; 24+ messages in thread
From: Chris Wilson @ 2012-03-26  9:18 UTC (permalink / raw)
  To: Intel Graphics Development; +Cc: Daniel Vetter

On Sun, 25 Mar 2012 19:47:42 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> The issue is that with inline clflushing the clflushing isn't properly
> swizzled. Fix this by
> - always clflushing entire 128 byte chunks and
> - unconditionally flush before writes when swizzling a given page.
>   We could be clever and check whether we pwrite a partial 128 byte
>   chunk instead of a partial cacheline, but I've figured that's not
>   worth it.

There's some black magic here that I haven't fully grasped. We only ever
swizzle the gpu address (by whole cachelines), so why do we need to
invalidate a pair of cachelines for a single cacheline write?

Also we have a lot of assumptions that the cacheline is 64 bytes. Have
we tested on gen2 where the GPU cacheline is 32 bytes?
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos
  2012-03-26  9:18   ` Chris Wilson
@ 2012-03-26  9:26     ` Daniel Vetter
  2012-03-26 12:50       ` Chris Wilson
  0 siblings, 1 reply; 24+ messages in thread
From: Daniel Vetter @ 2012-03-26  9:26 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Daniel Vetter, Intel Graphics Development

On Mon, Mar 26, 2012 at 10:18:39AM +0100, Chris Wilson wrote:
> On Sun, 25 Mar 2012 19:47:42 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > The issue is that with inline clflushing the clflushing isn't properly
> > swizzled. Fix this by
> > - always clflushing entire 128 byte chunks and
> > - unconditionally flush before writes when swizzling a given page.
> >   We could be clever and check whether we pwrite a partial 128 byte
> >   chunk instead of a partial cacheline, but I've figured that's not
> >   worth it.
> 
> There's some black magic here that I haven't fully grasped. We only ever
> swizzle the gpu address (by whole cachelines), so why do we need to
> invalidate a pair of cachelines for a single cacheline write?

Well, we do swizzle when doing the actual copy_to|from_user, so strictly
speaking we should also swizzle the clflushing in this case. But bit17
swizzled pwrite/pread is pretty much only around for backwards-compat
with dead-old userspace, so I figured I'd just unconditionally align
the clflush range to even cachelines when bit17 swizzling is in effect
on the current page, instead of adding a complex and rather untested
swizzled clflush helper.


> Also we have a lot of assumptions that the cacheline is 64 bytes. Have
> we tested on gen2 where the GPU cacheline is 32 bytes?

We're lucking out in that regard because gen2 doesn't do swizzling - at
least not on my i855gm here, where I could run the corresponding i-g-t
tests. And there's a comment in the code that i865g is unswizzled, too.

So as long as people create dram controllers where 64 bytes is the most
effective transaction size, we should be fine. It'll be a fun day though
when that changes. Otoh with gen5+ we don't have any bit17 swizzling
nonsense anymore because the gpu is much more integrated with the cpu. I
hope that trend continues and will prevent any bit17 madness in the
future.
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos
  2012-03-26  9:26     ` Daniel Vetter
@ 2012-03-26 12:50       ` Chris Wilson
  0 siblings, 0 replies; 24+ messages in thread
From: Chris Wilson @ 2012-03-26 12:50 UTC (permalink / raw)
  To: Daniel Vetter; +Cc: Daniel Vetter, Intel Graphics Development

On Mon, 26 Mar 2012 11:26:33 +0200, Daniel Vetter <daniel@ffwll.ch> wrote:
> On Mon, Mar 26, 2012 at 10:18:39AM +0100, Chris Wilson wrote:
> > On Sun, 25 Mar 2012 19:47:42 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > > The issue is that with inline clflushing the clflushing isn't properly
> > > swizzled. Fix this by
> > > - always clflushing entire 128 byte chunks and
> > > - unconditionally flush before writes when swizzling a given page.
> > >   We could be clever and check whether we pwrite a partial 128 byte
> > >   chunk instead of a partial cacheline, but I've figured that's not
> > >   worth it.
> > 
> > There's some black magic here that I haven't fully grasped. We only ever
> > swizzle the gpu address (by whole cachelines), so why do we need to
> > invalidate a pair of cachelines for a single cacheline write?
> 
> Well, we do swizzle when doing the actual copy_to|from_user, so strictly
> speaking we should also swizzle the clflushing in this case. But bit17
> swizzled pwrite/pread is pretty much only around for backwards-compat
> with dead-old userspace, so I figured I'd just unconditionally align
> the clflush range to even cachelines when bit17 swizzling is in effect
> on the current page, instead of adding a complex and rather untested
> swizzled clflush helper.

I was struggling to see where exactly we were swizzling the CPU address
because I failed to make the connection between shmem_page_offset and
gpu_offset.

With that resolved (Daniel, you deserve a special award for that!),
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely
  2012-03-26  9:09   ` Chris Wilson
@ 2012-03-27 11:42     ` Daniel Vetter
  0 siblings, 0 replies; 24+ messages in thread
From: Daniel Vetter @ 2012-03-27 11:42 UTC (permalink / raw)
  To: Chris Wilson; +Cc: Daniel Vetter, Intel Graphics Development

On Mon, Mar 26, 2012 at 10:09:41AM +0100, Chris Wilson wrote:
> On Sun, 25 Mar 2012 19:47:43 +0200, Daniel Vetter <daniel.vetter@ffwll.ch> wrote:
> > Beside helping the compiler untangle this maze they double-up as
> > documentation for which a parts aren't performance-critical but just
> > around to keep old (but already dead-slow) userspace from breaking.
> > 
> > Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>

I've queued up this entire series for 3.5. Thanks a lot for all your
review and testing on this!
-Daniel
-- 
Daniel Vetter
Mail: daniel@ffwll.ch
Mobile: +41 (0)79 365 57 48

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2012-03-27 11:42 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-25 17:47 [PATCH 01/16] drm/i915: merge shmem_pwrite slow&fast-path Daniel Vetter
2012-03-25 17:47 ` [PATCH 02/16] drm/i915: merge shmem_pread slow&fast-path Daniel Vetter
2012-03-25 17:47 ` [PATCH 03/16] drm: add helper to clflush a virtual address range Daniel Vetter
2012-03-25 17:47 ` [PATCH 04/16] drm/i915: move clflushing into shmem_pread Daniel Vetter
2012-03-25 17:47 ` [PATCH 05/16] drm/i915: kill ranged cpu read domain support Daniel Vetter
2012-03-25 17:47 ` [PATCH 06/16] drm/i915: don't use gtt_pwrite on LLC cached objects Daniel Vetter
2012-03-25 17:47 ` [PATCH 07/16] drm/i915: don't call shmem_read_mapping unnecessarily Daniel Vetter
2012-03-26  9:10   ` Daniel Vetter
2012-03-25 17:47 ` [PATCH 08/16] drm/i915: drop gtt slowpath Daniel Vetter
2012-03-25 17:47 ` [PATCH 09/16] drm/i915: don't clobber userspace memory before commiting to the pread Daniel Vetter
2012-03-25 17:47 ` [PATCH 10/16] drm/i915: implement inline clflush for pwrite Daniel Vetter
2012-03-25 17:47 ` [PATCH 11/16] drm/i915: fall back to shmem pwrite when the buffer is not accessible Daniel Vetter
2012-03-25 17:47 ` [PATCH 12/16] drm/i915: use uncached writes in pwrite Daniel Vetter
2012-03-25 17:47 ` [PATCH 13/16] drm/i915: extract copy helpers from shmem_pread|pwrite Daniel Vetter
2012-03-26  9:05   ` Chris Wilson
2012-03-25 17:47 ` [PATCH 14/16] mm: extend prefault helpers to fault in more than PAGE_SIZE Daniel Vetter
2012-03-26  9:09   ` Chris Wilson
2012-03-25 17:47 ` [PATCH 15/16] drm/i915: fixup in-line clflushing on bit17 swizzled bos Daniel Vetter
2012-03-26  9:18   ` Chris Wilson
2012-03-26  9:26     ` Daniel Vetter
2012-03-26 12:50       ` Chris Wilson
2012-03-25 17:47 ` [PATCH 16/16] drm/i915: mark pwrite/pread slowpaths with unlikely Daniel Vetter
2012-03-26  9:09   ` Chris Wilson
2012-03-27 11:42     ` Daniel Vetter
