All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
@ 2017-05-17 15:05 Chris Wilson
  2017-05-17 15:26 ` ✓ Fi.CI.BAT: success for " Patchwork
  2017-05-17 17:53 ` [PATCH] " Dongwon Kim
  0 siblings, 2 replies; 3+ messages in thread
From: Chris Wilson @ 2017-05-17 15:05 UTC (permalink / raw)
  To: intel-gfx; +Cc: Dongwon Kim

Currently, we only mark the CPU cache as dirty if we skip a clflush.
This leads to some confusion where we have to ask if the object is in
the write domain or missed a clflush. If we always mark the cache as
dirty, this becomes a much simply question to answer.

The goal remains to do as few clflushes as required and to do them as
late as possible, in the hope of deferring the work to a kthread and not
block the caller (e.g. execbuf, flips).

v2: Always call clflush before GPU execution when the cache_dirty flag
is set. This may cause some extra work on llc systems that migrate dirty
buffers back and forth - but we do try to limit that by only setting
cache_dirty at the end of the gpu sequence.

v3: Always mark the cache as dirty upon a level change, as we need to
invalidate any stale cachelines due to external writes.

Reported-by: Dongwon Kim <dongwon.kim@intel.com>
Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Dongwon Kim <dongwon.kim@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
---
 drivers/gpu/drm/i915/i915_gem.c                  | 76 ++++++++++++++----------
 drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
 drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
 drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
 drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
 6 files changed, 67 insertions(+), 56 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index 02adf8241394..155dd52f2d18 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
 
 static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
+	if (obj->cache_dirty)
 		return false;
 
 	if (!i915_gem_object_is_coherent(obj))
@@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
 	return st;
 }
 
+static void __start_cpu_write(struct drm_i915_gem_object *obj)
+{
+	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	if (cpu_write_needs_clflush(obj))
+		obj->cache_dirty = true;
+}
+
 static void
 __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 				struct sg_table *pages,
@@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
 	    !i915_gem_object_is_coherent(obj))
 		drm_clflush_sg(pages);
 
-	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	__start_cpu_write(obj);
 }
 
 static void
@@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
 			       args->size, &args->handle);
 }
 
+static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
+{
+	return !(obj->cache_level == I915_CACHE_NONE ||
+		 obj->cache_level == I915_CACHE_WT);
+}
+
 /**
  * Creates a new mm object and returns a handle to it.
  * @dev: drm device pointer
@@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
 	case I915_GEM_DOMAIN_CPU:
 		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
 		break;
+
+	case I915_GEM_DOMAIN_RENDER:
+		if (gpu_write_needs_clflush(obj))
+			obj->cache_dirty = true;
+		break;
 	}
 
 	obj->base.write_domain = 0;
@@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
 	 * optimizes for the case when the gpu will dirty the data
 	 * anyway again before the next pread happens.
 	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+	if (!obj->cache_dirty &&
+	    !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
 		*needs_clflush = CLFLUSH_BEFORE;
 
 out:
@@ -906,14 +925,16 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
 	 * This optimizes for the case when the gpu will use the data
 	 * right away and we therefore have to clflush anyway.
 	 */
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
+	if (!obj->cache_dirty) {
 		*needs_clflush |= CLFLUSH_AFTER;
 
-	/* Same trick applies to invalidate partially written cachelines read
-	 * before writing.
-	 */
-	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
-		*needs_clflush |= CLFLUSH_BEFORE;
+		/*
+		 * Same trick applies to invalidate partially written
+		 * cachelines read before writing.
+		 */
+		if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
+			*needs_clflush |= CLFLUSH_BEFORE;
+	}
 
 out:
 	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
@@ -3372,10 +3393,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
 
 static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
 {
-	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
-		return;
-
-	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
+	/*
+	 * We manually flush the CPU domain so that we can override and
+	 * force the flush for the display, and perform it asyncrhonously.
+	 */
+	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
+	if (obj->cache_dirty)
+		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
 	obj->base.write_domain = 0;
 }
 
@@ -3634,13 +3658,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
 		}
 	}
 
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
-	    i915_gem_object_is_coherent(obj))
-		obj->cache_dirty = true;
-
 	list_for_each_entry(vma, &obj->vma_list, obj_link)
 		vma->node.color = cache_level;
 	obj->cache_level = cache_level;
+	obj->cache_dirty = true; /* Always invalidate stale cachelines */
 
 	return 0;
 }
@@ -3862,9 +3883,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	if (ret)
 		return ret;
 
-	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
-		return 0;
-
 	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
 
 	/* Flush the CPU cache if it's still invalid. */
@@ -3876,15 +3894,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
 	/* It should now be out of any other write domains, and we can update
 	 * the domain values for our changes.
 	 */
-	GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
+	GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
 
 	/* If we're writing through the CPU, then the GPU read domains will
 	 * need to be invalidated at next use.
 	 */
-	if (write) {
-		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-	}
+	if (write)
+		__start_cpu_write(obj);
 
 	return 0;
 }
@@ -4304,6 +4320,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
 	} else
 		obj->cache_level = I915_CACHE_NONE;
 
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
+
 	trace_i915_gem_object_create(obj);
 
 	return obj;
@@ -4972,10 +4990,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
 
 	mutex_lock(&dev_priv->drm.struct_mutex);
 	for (p = phases; *p; p++) {
-		list_for_each_entry(obj, *p, global_link) {
-			obj->base.read_domains = I915_GEM_DOMAIN_CPU;
-			obj->base.write_domain = I915_GEM_DOMAIN_CPU;
-		}
+		list_for_each_entry(obj, *p, global_link)
+			__start_cpu_write(obj);
 	}
 	mutex_unlock(&dev_priv->drm.struct_mutex);
 
diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
index ffac7a1f0caf..17b207e963c2 100644
--- a/drivers/gpu/drm/i915/i915_gem_clflush.c
+++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
@@ -71,8 +71,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
 static void __i915_do_clflush(struct drm_i915_gem_object *obj)
 {
 	drm_clflush_sg(obj->mm.pages);
-	obj->cache_dirty = false;
-
 	intel_fb_obj_flush(obj, ORIGIN_CPU);
 }
 
@@ -81,9 +79,6 @@ static void i915_clflush_work(struct work_struct *work)
 	struct clflush *clflush = container_of(work, typeof(*clflush), work);
 	struct drm_i915_gem_object *obj = clflush->obj;
 
-	if (!obj->cache_dirty)
-		goto out;
-
 	if (i915_gem_object_pin_pages(obj)) {
 		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
 		goto out;
@@ -131,10 +126,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	 * anything not backed by physical memory we consider to be always
 	 * coherent and not need clflushing.
 	 */
-	if (!i915_gem_object_has_struct_page(obj))
+	if (!i915_gem_object_has_struct_page(obj)) {
+		obj->cache_dirty = false;
 		return;
-
-	obj->cache_dirty = true;
+	}
 
 	/* If the GPU is snooping the contents of the CPU cache,
 	 * we do not need to manually clear the CPU cache lines.  However,
@@ -153,6 +148,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	if (!(flags & I915_CLFLUSH_SYNC))
 		clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
 	if (clflush) {
+		GEM_BUG_ON(!obj->cache_dirty);
+
 		dma_fence_init(&clflush->dma,
 			       &i915_clflush_ops,
 			       &clflush_lock,
@@ -180,4 +177,6 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
 	} else {
 		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
 	}
+
+	obj->cache_dirty = false;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index af1965774e7b..0b8ae0f56675 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -291,7 +291,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
 		return DBG_USE_CPU_RELOC > 0;
 
 	return (HAS_LLC(to_i915(obj->base.dev)) ||
-		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
+		obj->cache_dirty ||
 		obj->cache_level != I915_CACHE_NONE);
 }
 
@@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
 		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
 			continue;
 
-		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
+		if (obj->cache_dirty)
 			i915_gem_clflush_object(obj, 0);
-			obj->base.write_domain = 0;
-		}
 
 		ret = i915_gem_request_await_object
 			(req, obj, obj->base.pending_write_domain);
@@ -1265,12 +1263,6 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
 	return ctx;
 }
 
-static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
-{
-	return !(obj->cache_level == I915_CACHE_NONE ||
-		 obj->cache_level == I915_CACHE_WT);
-}
-
 void i915_vma_move_to_active(struct i915_vma *vma,
 			     struct drm_i915_gem_request *req,
 			     unsigned int flags)
@@ -1294,15 +1286,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
 	i915_gem_active_set(&vma->last_read[idx], req);
 	list_move_tail(&vma->vm_link, &vma->vm->active_list);
 
+	obj->base.write_domain = 0;
 	if (flags & EXEC_OBJECT_WRITE) {
+		obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
+
 		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
 			i915_gem_active_set(&obj->frontbuffer_write, req);
 
-		/* update for the implicit flush after a batch */
-		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
-		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
-			obj->cache_dirty = true;
+		obj->base.read_domains = 0;
 	}
+	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
 
 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
 		i915_gem_active_set(&vma->last_fence, req);
diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
index fc950abbe400..58e93e87d573 100644
--- a/drivers/gpu/drm/i915/i915_gem_internal.c
+++ b/drivers/gpu/drm/i915/i915_gem_internal.c
@@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, size);
 	i915_gem_object_init(obj, &i915_gem_object_internal_ops);
 
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 
 	return obj;
 }
diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 58ccf8b8ca1c..9f84be171ad2 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
 
 	drm_gem_private_object_init(dev, &obj->base, args->user_size);
 	i915_gem_object_init(obj, &i915_gem_userptr_ops);
-	obj->cache_level = I915_CACHE_LLC;
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
+	obj->cache_level = I915_CACHE_LLC;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 
 	obj->userptr.ptr = args->user_ptr;
 	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
index 4e681fc13be4..0ca867a877b6 100644
--- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
+++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
@@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
 	drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
 	i915_gem_object_init(obj, &huge_ops);
 
-	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
+	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
 	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
+	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
 	obj->scratch = phys_size;
 
 	return obj;
-- 
2.11.0

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply related	[flat|nested] 3+ messages in thread

* ✓ Fi.CI.BAT: success for drm/i915: Mark CPU cache as dirty on every transition for CPU writes
  2017-05-17 15:05 [PATCH] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
@ 2017-05-17 15:26 ` Patchwork
  2017-05-17 17:53 ` [PATCH] " Dongwon Kim
  1 sibling, 0 replies; 3+ messages in thread
From: Patchwork @ 2017-05-17 15:26 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: drm/i915: Mark CPU cache as dirty on every transition for CPU writes
URL   : https://patchwork.freedesktop.org/series/24569/
State : success

== Summary ==

Series 24569v1 drm/i915: Mark CPU cache as dirty on every transition for CPU writes
https://patchwork.freedesktop.org/api/1.0/series/24569/revisions/1/mbox/

Test kms_cursor_legacy:
        Subgroup basic-busy-flip-before-cursor-atomic:
                pass       -> FAIL       (fi-snb-2600) fdo#100215

fdo#100215 https://bugs.freedesktop.org/show_bug.cgi?id=100215

fi-bdw-5557u     total:278  pass:267  dwarn:0   dfail:0   fail:0   skip:11  time:446s
fi-bsw-n3050     total:278  pass:242  dwarn:0   dfail:0   fail:0   skip:36  time:577s
fi-bxt-j4205     total:278  pass:259  dwarn:0   dfail:0   fail:0   skip:19  time:514s
fi-byt-j1900     total:278  pass:254  dwarn:0   dfail:0   fail:0   skip:24  time:492s
fi-byt-n2820     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time:485s
fi-hsw-4770      total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:419s
fi-hsw-4770r     total:278  pass:262  dwarn:0   dfail:0   fail:0   skip:16  time:415s
fi-ilk-650       total:278  pass:228  dwarn:0   dfail:0   fail:0   skip:50  time:426s
fi-ivb-3520m     total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:495s
fi-ivb-3770      total:278  pass:260  dwarn:0   dfail:0   fail:0   skip:18  time:467s
fi-kbl-7500u     total:278  pass:255  dwarn:5   dfail:0   fail:0   skip:18  time:463s
fi-skl-6260u     total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:460s
fi-skl-6700hq    total:278  pass:261  dwarn:0   dfail:0   fail:0   skip:17  time:583s
fi-skl-6700k     total:278  pass:256  dwarn:4   dfail:0   fail:0   skip:18  time:468s
fi-skl-6770hq    total:278  pass:268  dwarn:0   dfail:0   fail:0   skip:10  time:500s
fi-snb-2520m     total:278  pass:250  dwarn:0   dfail:0   fail:0   skip:28  time:535s
fi-snb-2600      total:278  pass:248  dwarn:0   dfail:0   fail:1   skip:29  time:409s

eb3549d312620118dec3a69200894ac8a8fff358 drm-tip: 2017y-05m-17d-13h-53m-40s UTC integration manifest
cce0e79 drm/i915: Mark CPU cache as dirty on every transition for CPU writes

== Logs ==

For more details see: https://intel-gfx-ci.01.org/CI/Patchwork_4727/
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH] drm/i915: Mark CPU cache as dirty on every transition for CPU writes
  2017-05-17 15:05 [PATCH] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
  2017-05-17 15:26 ` ✓ Fi.CI.BAT: success for " Patchwork
@ 2017-05-17 17:53 ` Dongwon Kim
  1 sibling, 0 replies; 3+ messages in thread
From: Dongwon Kim @ 2017-05-17 17:53 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

Chris, 

This patch works now as we unconditionally set cache_dirty
in set_cache_level function.

Tested-by: Dongwon Kim <dongwon.kim@intel.com>

On Wed, May 17, 2017 at 04:05:24PM +0100, Chris Wilson wrote:
> Currently, we only mark the CPU cache as dirty if we skip a clflush.
> This leads to some confusion where we have to ask if the object is in
> the write domain or missed a clflush. If we always mark the cache as
> dirty, this becomes a much simply question to answer.
> 
> The goal remains to do as few clflushes as required and to do them as
> late as possible, in the hope of deferring the work to a kthread and not
> block the caller (e.g. execbuf, flips).
> 
> v2: Always call clflush before GPU execution when the cache_dirty flag
> is set. This may cause some extra work on llc systems that migrate dirty
> buffers back and forth - but we do try to limit that by only setting
> cache_dirty at the end of the gpu sequence.
> 
> v3: Always mark the cache as dirty upon a level change, as we need to
> invalidate any stale cachelines due to external writes.
> 
> Reported-by: Dongwon Kim <dongwon.kim@intel.com>
> Fixes: a6a7cc4b7db6 ("drm/i915: Always flush the dirty CPU cache when pinning the scanout")
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> Cc: Dongwon Kim <dongwon.kim@intel.com>
> Cc: Matt Roper <matthew.d.roper@intel.com>
> ---
>  drivers/gpu/drm/i915/i915_gem.c                  | 76 ++++++++++++++----------
>  drivers/gpu/drm/i915/i915_gem_clflush.c          | 15 +++--
>  drivers/gpu/drm/i915/i915_gem_execbuffer.c       | 21 +++----
>  drivers/gpu/drm/i915/i915_gem_internal.c         |  3 +-
>  drivers/gpu/drm/i915/i915_gem_userptr.c          |  5 +-
>  drivers/gpu/drm/i915/selftests/huge_gem_object.c |  3 +-
>  6 files changed, 67 insertions(+), 56 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
> index 02adf8241394..155dd52f2d18 100644
> --- a/drivers/gpu/drm/i915/i915_gem.c
> +++ b/drivers/gpu/drm/i915/i915_gem.c
> @@ -49,7 +49,7 @@ static void i915_gem_flush_free_objects(struct drm_i915_private *i915);
>  
>  static bool cpu_write_needs_clflush(struct drm_i915_gem_object *obj)
>  {
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> +	if (obj->cache_dirty)
>  		return false;
>  
>  	if (!i915_gem_object_is_coherent(obj))
> @@ -233,6 +233,14 @@ i915_gem_object_get_pages_phys(struct drm_i915_gem_object *obj)
>  	return st;
>  }
>  
> +static void __start_cpu_write(struct drm_i915_gem_object *obj)
> +{
> +	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	if (cpu_write_needs_clflush(obj))
> +		obj->cache_dirty = true;
> +}
> +
>  static void
>  __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>  				struct sg_table *pages,
> @@ -248,8 +256,7 @@ __i915_gem_object_release_shmem(struct drm_i915_gem_object *obj,
>  	    !i915_gem_object_is_coherent(obj))
>  		drm_clflush_sg(pages);
>  
> -	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	__start_cpu_write(obj);
>  }
>  
>  static void
> @@ -684,6 +691,12 @@ i915_gem_dumb_create(struct drm_file *file,
>  			       args->size, &args->handle);
>  }
>  
> +static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> +{
> +	return !(obj->cache_level == I915_CACHE_NONE ||
> +		 obj->cache_level == I915_CACHE_WT);
> +}
> +
>  /**
>   * Creates a new mm object and returns a handle to it.
>   * @dev: drm device pointer
> @@ -753,6 +766,11 @@ flush_write_domain(struct drm_i915_gem_object *obj, unsigned int flush_domains)
>  	case I915_GEM_DOMAIN_CPU:
>  		i915_gem_clflush_object(obj, I915_CLFLUSH_SYNC);
>  		break;
> +
> +	case I915_GEM_DOMAIN_RENDER:
> +		if (gpu_write_needs_clflush(obj))
> +			obj->cache_dirty = true;
> +		break;
>  	}
>  
>  	obj->base.write_domain = 0;
> @@ -854,7 +872,8 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
>  	 * optimizes for the case when the gpu will dirty the data
>  	 * anyway again before the next pread happens.
>  	 */
> -	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> +	if (!obj->cache_dirty &&
> +	    !(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
>  		*needs_clflush = CLFLUSH_BEFORE;
>  
>  out:
> @@ -906,14 +925,16 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
>  	 * This optimizes for the case when the gpu will use the data
>  	 * right away and we therefore have to clflush anyway.
>  	 */
> -	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU)
> +	if (!obj->cache_dirty) {
>  		*needs_clflush |= CLFLUSH_AFTER;
>  
> -	/* Same trick applies to invalidate partially written cachelines read
> -	 * before writing.
> -	 */
> -	if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> -		*needs_clflush |= CLFLUSH_BEFORE;
> +		/*
> +		 * Same trick applies to invalidate partially written
> +		 * cachelines read before writing.
> +		 */
> +		if (!(obj->base.read_domains & I915_GEM_DOMAIN_CPU))
> +			*needs_clflush |= CLFLUSH_BEFORE;
> +	}
>  
>  out:
>  	intel_fb_obj_invalidate(obj, ORIGIN_CPU);
> @@ -3372,10 +3393,13 @@ int i915_gem_wait_for_idle(struct drm_i915_private *i915, unsigned int flags)
>  
>  static void __i915_gem_object_flush_for_display(struct drm_i915_gem_object *obj)
>  {
> -	if (obj->base.write_domain != I915_GEM_DOMAIN_CPU && !obj->cache_dirty)
> -		return;
> -
> -	i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
> +	/*
> +	 * We manually flush the CPU domain so that we can override and
> +	 * force the flush for the display, and perform it asyncrhonously.
> +	 */
> +	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
> +	if (obj->cache_dirty)
> +		i915_gem_clflush_object(obj, I915_CLFLUSH_FORCE);
>  	obj->base.write_domain = 0;
>  }
>  
> @@ -3634,13 +3658,10 @@ int i915_gem_object_set_cache_level(struct drm_i915_gem_object *obj,
>  		}
>  	}
>  
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU &&
> -	    i915_gem_object_is_coherent(obj))
> -		obj->cache_dirty = true;
> -
>  	list_for_each_entry(vma, &obj->vma_list, obj_link)
>  		vma->node.color = cache_level;
>  	obj->cache_level = cache_level;
> +	obj->cache_dirty = true; /* Always invalidate stale cachelines */
>  
>  	return 0;
>  }
> @@ -3862,9 +3883,6 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>  	if (ret)
>  		return ret;
>  
> -	if (obj->base.write_domain == I915_GEM_DOMAIN_CPU)
> -		return 0;
> -
>  	flush_write_domain(obj, ~I915_GEM_DOMAIN_CPU);
>  
>  	/* Flush the CPU cache if it's still invalid. */
> @@ -3876,15 +3894,13 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
>  	/* It should now be out of any other write domains, and we can update
>  	 * the domain values for our changes.
>  	 */
> -	GEM_BUG_ON((obj->base.write_domain & ~I915_GEM_DOMAIN_CPU) != 0);
> +	GEM_BUG_ON(obj->base.write_domain & ~I915_GEM_DOMAIN_CPU);
>  
>  	/* If we're writing through the CPU, then the GPU read domains will
>  	 * need to be invalidated at next use.
>  	 */
> -	if (write) {
> -		obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -		obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> -	}
> +	if (write)
> +		__start_cpu_write(obj);
>  
>  	return 0;
>  }
> @@ -4304,6 +4320,8 @@ i915_gem_object_create(struct drm_i915_private *dev_priv, u64 size)
>  	} else
>  		obj->cache_level = I915_CACHE_NONE;
>  
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
> +
>  	trace_i915_gem_object_create(obj);
>  
>  	return obj;
> @@ -4972,10 +4990,8 @@ int i915_gem_freeze_late(struct drm_i915_private *dev_priv)
>  
>  	mutex_lock(&dev_priv->drm.struct_mutex);
>  	for (p = phases; *p; p++) {
> -		list_for_each_entry(obj, *p, global_link) {
> -			obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> -			obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> -		}
> +		list_for_each_entry(obj, *p, global_link)
> +			__start_cpu_write(obj);
>  	}
>  	mutex_unlock(&dev_priv->drm.struct_mutex);
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_clflush.c b/drivers/gpu/drm/i915/i915_gem_clflush.c
> index ffac7a1f0caf..17b207e963c2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_clflush.c
> +++ b/drivers/gpu/drm/i915/i915_gem_clflush.c
> @@ -71,8 +71,6 @@ static const struct dma_fence_ops i915_clflush_ops = {
>  static void __i915_do_clflush(struct drm_i915_gem_object *obj)
>  {
>  	drm_clflush_sg(obj->mm.pages);
> -	obj->cache_dirty = false;
> -
>  	intel_fb_obj_flush(obj, ORIGIN_CPU);
>  }
>  
> @@ -81,9 +79,6 @@ static void i915_clflush_work(struct work_struct *work)
>  	struct clflush *clflush = container_of(work, typeof(*clflush), work);
>  	struct drm_i915_gem_object *obj = clflush->obj;
>  
> -	if (!obj->cache_dirty)
> -		goto out;
> -
>  	if (i915_gem_object_pin_pages(obj)) {
>  		DRM_ERROR("Failed to acquire obj->pages for clflushing\n");
>  		goto out;
> @@ -131,10 +126,10 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	 * anything not backed by physical memory we consider to be always
>  	 * coherent and not need clflushing.
>  	 */
> -	if (!i915_gem_object_has_struct_page(obj))
> +	if (!i915_gem_object_has_struct_page(obj)) {
> +		obj->cache_dirty = false;
>  		return;
> -
> -	obj->cache_dirty = true;
> +	}
>  
>  	/* If the GPU is snooping the contents of the CPU cache,
>  	 * we do not need to manually clear the CPU cache lines.  However,
> @@ -153,6 +148,8 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	if (!(flags & I915_CLFLUSH_SYNC))
>  		clflush = kmalloc(sizeof(*clflush), GFP_KERNEL);
>  	if (clflush) {
> +		GEM_BUG_ON(!obj->cache_dirty);
> +
>  		dma_fence_init(&clflush->dma,
>  			       &i915_clflush_ops,
>  			       &clflush_lock,
> @@ -180,4 +177,6 @@ void i915_gem_clflush_object(struct drm_i915_gem_object *obj,
>  	} else {
>  		GEM_BUG_ON(obj->base.write_domain != I915_GEM_DOMAIN_CPU);
>  	}
> +
> +	obj->cache_dirty = false;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> index af1965774e7b..0b8ae0f56675 100644
> --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
> @@ -291,7 +291,7 @@ static inline int use_cpu_reloc(struct drm_i915_gem_object *obj)
>  		return DBG_USE_CPU_RELOC > 0;
>  
>  	return (HAS_LLC(to_i915(obj->base.dev)) ||
> -		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
> +		obj->cache_dirty ||
>  		obj->cache_level != I915_CACHE_NONE);
>  }
>  
> @@ -1129,10 +1129,8 @@ i915_gem_execbuffer_move_to_gpu(struct drm_i915_gem_request *req,
>  		if (vma->exec_entry->flags & EXEC_OBJECT_ASYNC)
>  			continue;
>  
> -		if (obj->base.write_domain & I915_GEM_DOMAIN_CPU) {
> +		if (obj->cache_dirty)
>  			i915_gem_clflush_object(obj, 0);
> -			obj->base.write_domain = 0;
> -		}
>  
>  		ret = i915_gem_request_await_object
>  			(req, obj, obj->base.pending_write_domain);
> @@ -1265,12 +1263,6 @@ i915_gem_validate_context(struct drm_device *dev, struct drm_file *file,
>  	return ctx;
>  }
>  
> -static bool gpu_write_needs_clflush(struct drm_i915_gem_object *obj)
> -{
> -	return !(obj->cache_level == I915_CACHE_NONE ||
> -		 obj->cache_level == I915_CACHE_WT);
> -}
> -
>  void i915_vma_move_to_active(struct i915_vma *vma,
>  			     struct drm_i915_gem_request *req,
>  			     unsigned int flags)
> @@ -1294,15 +1286,16 @@ void i915_vma_move_to_active(struct i915_vma *vma,
>  	i915_gem_active_set(&vma->last_read[idx], req);
>  	list_move_tail(&vma->vm_link, &vma->vm->active_list);
>  
> +	obj->base.write_domain = 0;
>  	if (flags & EXEC_OBJECT_WRITE) {
> +		obj->base.write_domain = I915_GEM_DOMAIN_RENDER;
> +
>  		if (intel_fb_obj_invalidate(obj, ORIGIN_CS))
>  			i915_gem_active_set(&obj->frontbuffer_write, req);
>  
> -		/* update for the implicit flush after a batch */
> -		obj->base.write_domain &= ~I915_GEM_GPU_DOMAINS;
> -		if (!obj->cache_dirty && gpu_write_needs_clflush(obj))
> -			obj->cache_dirty = true;
> +		obj->base.read_domains = 0;
>  	}
> +	obj->base.read_domains |= I915_GEM_GPU_DOMAINS;
>  
>  	if (flags & EXEC_OBJECT_NEEDS_FENCE)
>  		i915_gem_active_set(&vma->last_fence, req);
> diff --git a/drivers/gpu/drm/i915/i915_gem_internal.c b/drivers/gpu/drm/i915/i915_gem_internal.c
> index fc950abbe400..58e93e87d573 100644
> --- a/drivers/gpu/drm/i915/i915_gem_internal.c
> +++ b/drivers/gpu/drm/i915/i915_gem_internal.c
> @@ -188,9 +188,10 @@ i915_gem_object_create_internal(struct drm_i915_private *i915,
>  	drm_gem_private_object_init(&i915->drm, &obj->base, size);
>  	i915_gem_object_init(obj, &i915_gem_object_internal_ops);
>  
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  
>  	return obj;
>  }
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index 58ccf8b8ca1c..9f84be171ad2 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -802,9 +802,10 @@ i915_gem_userptr_ioctl(struct drm_device *dev, void *data, struct drm_file *file
>  
>  	drm_gem_private_object_init(dev, &obj->base, args->user_size);
>  	i915_gem_object_init(obj, &i915_gem_userptr_ops);
> -	obj->cache_level = I915_CACHE_LLC;
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
> +	obj->cache_level = I915_CACHE_LLC;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  
>  	obj->userptr.ptr = args->user_ptr;
>  	obj->userptr.read_only = !!(args->flags & I915_USERPTR_READ_ONLY);
> diff --git a/drivers/gpu/drm/i915/selftests/huge_gem_object.c b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> index 4e681fc13be4..0ca867a877b6 100644
> --- a/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> +++ b/drivers/gpu/drm/i915/selftests/huge_gem_object.c
> @@ -126,9 +126,10 @@ huge_gem_object(struct drm_i915_private *i915,
>  	drm_gem_private_object_init(&i915->drm, &obj->base, dma_size);
>  	i915_gem_object_init(obj, &huge_ops);
>  
> -	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->base.read_domains = I915_GEM_DOMAIN_CPU;
> +	obj->base.write_domain = I915_GEM_DOMAIN_CPU;
>  	obj->cache_level = HAS_LLC(i915) ? I915_CACHE_LLC : I915_CACHE_NONE;
> +	obj->cache_dirty = !i915_gem_object_is_coherent(obj);
>  	obj->scratch = phys_size;
>  
>  	return obj;
> -- 
> 2.11.0
> 
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2017-05-17 17:54 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-17 15:05 [PATCH] drm/i915: Mark CPU cache as dirty on every transition for CPU writes Chris Wilson
2017-05-17 15:26 ` ✓ Fi.CI.BAT: success for " Patchwork
2017-05-17 17:53 ` [PATCH] " Dongwon Kim

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.