From: Chris Wilson <chris@chris-wilson.co.uk>
To: intel-gfx@lists.freedesktop.org
Subject: [PATCH 14/15] drm/i915: Async GPU relocation processing
Date: Thu, 16 Mar 2017 13:20:05 +0000
Message-ID: <20170316132006.7976-15-chris@chris-wilson.co.uk>
In-Reply-To: <20170316132006.7976-1-chris@chris-wilson.co.uk>

If the user requires patching of their batch or auxiliary buffers, we
currently make the alterations on the CPU. If those buffers are active
on the GPU at the time, we wait under the struct_mutex for them to
finish executing before we rewrite the contents. This happens when
shared relocation trees are used between different contexts with
separate address spaces (the buffers then have different addresses in
each), as the 3D state has to be adjusted between execution on each
context. However, we don't need to use the CPU to do the relocation
patching: we can queue commands to the GPU to perform it instead, and
use fences to serialise the operation with the current and future
activity, so that the operation on the GPU appears just as atomic as
performing it immediately. Performing the relocation rewrites on the
GPU is not free; in terms of pure throughput, the number of
relocations/s is about halved - but, more importantly, so is the time
spent under the struct_mutex.
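
The mechanism, in outline (a condensed sketch only; error handling,
pinning and domain management are elided, and the full version is
reloc_gpu() and relocate_entry() below):

	/* grab a page from the engine's batch pool to hold the fixups */
	obj = i915_gem_batch_pool_get(&eb->engine->batch_pool, PAGE_SIZE);
	cmd = i915_gem_object_pin_map(obj, I915_MAP_WB); /* WC if !llc */
	batch = i915_vma_instance(obj, vma->vm, NULL);

	rq = i915_gem_request_alloc(eb->engine, eb->ctx);
	/* order the rewrite after whatever is still using the target */
	i915_gem_request_await_object(rq, vma->obj, EXEC_OBJECT_WRITE);

	/* one store-immediate per relocation, e.g. gen8+ qword-aligned */
	addr = gen8_canonical_addr(vma->node.start + offset);
	*cmd++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
	*cmd++ = lower_32_bits(addr);
	*cmd++ = upper_32_bits(addr);
	*cmd++ = lower_32_bits(target_offset);
	*cmd++ = upper_32_bits(target_offset);
	*cmd++ = MI_BATCH_BUFFER_END;

	eb->engine->emit_bb_start(rq, batch->node.start, PAGE_SIZE, 0);
	/* the exclusive fence makes the rewrite appear atomic to all
	 * later users of the object, on the GPU and the CPU alike */
	reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
	__i915_add_request(rq, true);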

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/i915_gem.c            |   1 -
 drivers/gpu/drm/i915/i915_gem_execbuffer.c | 217 +++++++++++++++++++++++++++--
 2 files changed, 208 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
index c6077eba3d3c..15ca0dea0a13 100644
--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -4192,7 +4192,6 @@ static void __i915_gem_free_objects(struct drm_i915_private *i915,
 		GEM_BUG_ON(i915_gem_object_is_active(obj));
 		list_for_each_entry_safe(vma, vn,
 					 &obj->vma_list, obj_link) {
-			GEM_BUG_ON(!i915_vma_is_ggtt(vma));
 			GEM_BUG_ON(i915_vma_is_active(vma));
 			vma->flags &= ~I915_VMA_PIN_MASK;
 			i915_vma_close(vma);
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d39ea2ba20ff..89d2218ae9ea 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -40,7 +40,12 @@
 #include "intel_drv.h"
 #include "intel_frontbuffer.h"
 
-#define DBG_USE_CPU_RELOC 0 /* -1 force GTT relocs; 1 force CPU relocs */
+enum {
+	FORCE_CPU_RELOC = 1,
+	FORCE_GTT_RELOC,
+	FORCE_GPU_RELOC,
+#define DBG_FORCE_RELOC 0 /* choose one of the above! */
+};
 
 #define  __EXEC_OBJECT_HAS_PIN		BIT(31)
 #define  __EXEC_OBJECT_HAS_FENCE	BIT(30)
@@ -187,9 +192,14 @@ struct i915_execbuffer {
 		struct drm_mm_node node;
 		unsigned long vaddr;
 		unsigned int page;
-		bool use_64bit_reloc;
-		bool has_llc;
-		bool has_fence;
+		unsigned int gen;
+		bool use_64bit_reloc : 1;
+		bool has_llc : 1;
+		bool has_fence : 1;
+
+		struct drm_i915_gem_request *rq;
+		u32 *rq_cmd;
+		unsigned int rq_size;
 	} reloc_cache;
 	u64 invalid_flags;
 	u32 context_flags;
@@ -434,8 +444,11 @@ static inline int use_cpu_reloc(const struct reloc_cache *cache,
 	if (!i915_gem_object_has_struct_page(obj))
 		return false;
 
-	if (DBG_USE_CPU_RELOC)
-		return DBG_USE_CPU_RELOC > 0;
+	if (DBG_FORCE_RELOC == FORCE_CPU_RELOC)
+		return true;
+
+	if (DBG_FORCE_RELOC == FORCE_GTT_RELOC)
+		return false;
 
 	return (cache->has_llc ||
 		obj->base.write_domain == I915_GEM_DOMAIN_CPU ||
@@ -807,10 +820,13 @@ static void reloc_cache_init(struct reloc_cache *cache,
 	cache->page = -1;
 	cache->vaddr = 0;
 	/* Must be a variable in the struct to allow GCC to unroll. */
+	cache->gen = INTEL_GEN(i915);
 	cache->has_llc = HAS_LLC(i915);
-	cache->has_fence = INTEL_GEN(i915) < 4;
-	cache->use_64bit_reloc = HAS_64BIT_RELOC(i915);
+	cache->has_fence = cache->gen < 4;
+	cache->use_64bit_reloc = cache->gen >= 8;
 	cache->node.allocated = false;
+	cache->rq = NULL;
+	cache->rq_size = 0;
 }
 
 static inline void *unmask_page(unsigned long p)
@@ -832,10 +848,24 @@ static inline struct i915_ggtt *cache_to_ggtt(struct reloc_cache *cache)
 	return &i915->ggtt;
 }
 
+static void reloc_gpu_flush(struct reloc_cache *cache)
+{
+	GEM_BUG_ON(cache->rq_size >= cache->rq->batch->obj->base.size / sizeof(u32));
+	cache->rq_cmd[cache->rq_size] = MI_BATCH_BUFFER_END;
+	i915_gem_object_unpin_map(cache->rq->batch->obj);
+	i915_gem_chipset_flush(cache->rq->i915);
+
+	__i915_add_request(cache->rq, true);
+	cache->rq = NULL;
+}
+
 static void reloc_cache_reset(struct reloc_cache *cache)
 {
 	void *vaddr;
 
+	if (cache->rq)
+		reloc_gpu_flush(cache);
+
 	if (!cache->vaddr)
 		return;
 
@@ -1000,6 +1030,112 @@ static void clflush_write32(u32 *addr, u32 value, unsigned int flushes)
 		*addr = value;
 }
 
+static u32 *reloc_gpu(struct i915_execbuffer *eb,
+		      struct i915_vma *vma,
+		      unsigned int len)
+{
+	struct reloc_cache *cache = &eb->reloc_cache;
+	u32 *cmd;
+
+	if (cache->rq_size > PAGE_SIZE/sizeof(u32) - (len + 1))
+		reloc_gpu_flush(cache);
+
+	if (!cache->rq) {
+		struct drm_i915_gem_object *obj;
+		struct drm_i915_gem_request *rq;
+		struct i915_vma *batch;
+		int err;
+
+		GEM_BUG_ON(vma->obj->base.write_domain & I915_GEM_DOMAIN_CPU);
+
+		obj = i915_gem_batch_pool_get(&eb->engine->batch_pool,
+					      PAGE_SIZE);
+		if (IS_ERR(obj))
+			return ERR_CAST(obj);
+
+		cmd = i915_gem_object_pin_map(obj,
+					      cache->has_llc ? I915_MAP_WB : I915_MAP_WC);
+		i915_gem_object_unpin_pages(obj);
+		if (IS_ERR(cmd))
+			return ERR_CAST(cmd);
+
+		err = i915_gem_object_set_to_gtt_domain(obj, false);
+		if (err) {
+err_unmap:
+			i915_gem_object_unpin_map(obj);
+			return ERR_PTR(err);
+		}
+
+		batch = i915_vma_instance(obj, vma->vm, NULL);
+		if (IS_ERR(batch)) {
+			err = PTR_ERR(batch);
+			goto err_unmap;
+		}
+
+		err = i915_vma_pin(batch, 0, 0, PIN_USER | PIN_NONBLOCK);
+		if (err)
+			goto err_unmap;
+
+		rq = i915_gem_request_alloc(eb->engine, eb->ctx);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+err_unpin:
+			i915_vma_unpin(batch);
+			goto err_unmap;
+		}
+
+		err = i915_gem_request_await_object(rq,
+						    vma->obj,
+						    EXEC_OBJECT_WRITE);
+		if (err) {
+err_request:
+			i915_add_request(rq);
+			goto err_unpin;
+		}
+
+		err = eb->engine->emit_flush(rq, EMIT_INVALIDATE);
+		if (err)
+			goto err_request;
+
+		err = i915_switch_context(rq);
+		if (err)
+			goto err_request;
+
+		err = eb->engine->emit_bb_start(rq,
+						batch->node.start, PAGE_SIZE,
+						cache->gen > 5 ? 0 : I915_DISPATCH_SECURE);
+		if (err)
+			goto err_request;
+
+		GEM_BUG_ON(!reservation_object_test_signaled_rcu(obj->resv,
+								 true));
+		i915_vma_move_to_active(batch, rq, 0);
+		reservation_object_lock(obj->resv, NULL);
+		reservation_object_add_excl_fence(obj->resv, &rq->fence);
+		reservation_object_unlock(obj->resv);
+		i915_vma_unpin(batch);
+
+		i915_vma_move_to_active(vma, rq, true);
+		reservation_object_lock(vma->obj->resv, NULL);
+		reservation_object_add_excl_fence(vma->obj->resv, &rq->fence);
+		reservation_object_unlock(vma->obj->resv);
+
+		vma->obj->base.write_domain = 0;
+		vma->obj->base.read_domains = I915_GEM_GPU_DOMAINS;
+
+		rq->batch = batch;
+
+		cache->rq = rq;
+		cache->rq_cmd = cmd;
+		cache->rq_size = 0;
+	}
+
+	cmd = cache->rq_cmd + cache->rq_size;
+	cache->rq_size += len;
+
+	return cmd;
+}
+
 static u64
 relocate_entry(struct i915_vma *vma,
 	       const struct drm_i915_gem_relocation_entry *reloc,
@@ -1012,6 +1148,67 @@ relocate_entry(struct i915_vma *vma,
 	bool wide = eb->reloc_cache.use_64bit_reloc;
 	void *vaddr;
 
+	if (!eb->reloc_cache.vaddr &&
+	    (DBG_FORCE_RELOC == FORCE_GPU_RELOC ||
+	     !reservation_object_test_signaled_rcu(obj->resv, true))) {
+		const unsigned int gen = eb->reloc_cache.gen;
+		unsigned int len;
+		u32 *batch;
+		u64 addr;
+
+		if (wide)
+			len = offset & 7 ? 8 : 5;
+		else if (gen >= 4)
+			len = 4;
+		else if (gen >= 3)
+			len = 3;
+		else /* On gen2 MI_STORE_DWORD_IMM uses a physical address */
+			goto repeat;
+
+		batch = reloc_gpu(eb, vma, len);
+		if (IS_ERR(batch))
+			goto repeat;
+
+		addr = gen8_canonical_addr(vma->node.start + offset);
+		if (wide) {
+			if (offset & 7) {
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+
+				addr = gen8_canonical_addr(addr + 4);
+
+				*batch++ = MI_STORE_DWORD_IMM_GEN4;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = upper_32_bits(target_offset);
+			} else {
+				*batch++ = (MI_STORE_DWORD_IMM_GEN4 | (1 << 21)) + 1;
+				*batch++ = lower_32_bits(addr);
+				*batch++ = upper_32_bits(addr);
+				*batch++ = lower_32_bits(target_offset);
+				*batch++ = upper_32_bits(target_offset);
+			}
+		} else if (gen >= 6) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else if (gen >= 4) {
+			*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
+			*batch++ = 0;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		} else {
+			*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
+			*batch++ = addr;
+			*batch++ = target_offset;
+		}
+
+		goto out;
+	}
+
 repeat:
 	vaddr = reloc_vaddr(obj, &eb->reloc_cache, offset >> PAGE_SHIFT);
 	if (IS_ERR(vaddr))
@@ -1028,6 +1225,7 @@ relocate_entry(struct i915_vma *vma,
 		goto repeat;
 	}
 
+out:
 	return gen8_canonical_addr(target->node.start) | 1;
 }
 
@@ -1088,7 +1286,8 @@ eb_relocate_entry(struct i915_execbuffer *eb,
 	/* If the relocation already has the right value in it, no
 	 * more work needs to be done.
 	 */
-	if (gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
+	if (!DBG_FORCE_RELOC &&
+	    gen8_canonical_addr(target->node.start) == reloc->presumed_offset)
 		return 0;
 
 	/* Check that the relocation address is valid... */
-- 
2.11.0
