* [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt
@ 2020-11-28 18:40 Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 2/4] drm/i915/gt: Add a routine to iterate over the pagetables of a GTT Chris Wilson
                   ` (4 more replies)
  0 siblings, 5 replies; 16+ messages in thread
From: Chris Wilson @ 2020-11-28 18:40 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

In the next patch, we will want to write a PTE for an explicit
dma address, outside of the usual vma.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
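A minimal userspace sketch of the index arithmetic the new hook relies on
(illustrative only: the macro values and helper below are assumptions modelled
on the gen8 layout of 9 bits per page-table level, not the driver code):

/* gen8_index_demo.c - decompose a GTT offset into per-level table indices */
#include <stdio.h>
#include <stdint.h>

#define GEN8_PTE_SHIFT	12	/* 4KiB pages */
#define GEN8_PDES	512	/* entries per table, i.e. 9 bits per level */

static unsigned int pd_index(uint64_t idx, int lvl)
{
	return (idx >> (9 * lvl)) & (GEN8_PDES - 1);	/* mirrors gen8_pd_index() */
}

int main(void)
{
	uint64_t offset = 0x1234567000ull;	/* an arbitrary page-aligned address */
	uint64_t idx = offset >> GEN8_PTE_SHIFT;

	printf("pml4e=%u pdpe=%u pde=%u pte=%u\n",
	       pd_index(idx, 3), pd_index(idx, 2),
	       pd_index(idx, 1), pd_index(idx, 0));
	return 0;
}

Walking pdp -> pd -> pt with those indices and writing one encoded u64 (plus a
clflush) is all gen8_ppgtt_insert_entry() below does, without going through an
i915_vma.
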
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index a37c968ef8f7..1600a654baa8 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -556,6 +556,25 @@ static void gen8_ppgtt_insert(struct i915_address_space *vm,
 	}
 }
 
+static void gen8_ppgtt_insert_entry(struct i915_address_space *vm,
+				    dma_addr_t addr,
+				    u64 offset,
+				    enum i915_cache_level level,
+				    u32 flags)
+{
+	u64 idx = offset >> GEN8_PTE_SHIFT;
+	struct i915_page_directory * const pdp =
+		gen8_pdp_for_page_index(vm, idx);
+	struct i915_page_directory *pd =
+		i915_pd_entry(pdp, gen8_pd_index(idx, 2));
+	gen8_pte_t *vaddr;
+
+	vaddr = kmap_atomic_px(i915_pt_entry(pd, gen8_pd_index(idx, 1)));
+	vaddr[gen8_pd_index(idx, 0)] = gen8_pte_encode(addr, level, flags);
+	clflush_cache_range(&vaddr[gen8_pd_index(idx, 0)], sizeof(*vaddr));
+	kunmap_atomic(vaddr);
+}
+
 static int gen8_init_scratch(struct i915_address_space *vm)
 {
 	int ret;
@@ -728,6 +747,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt)
 
 	ppgtt->vm.bind_async_flags = I915_VMA_LOCAL_BIND;
 	ppgtt->vm.insert_entries = gen8_ppgtt_insert;
+	ppgtt->vm.insert_page = gen8_ppgtt_insert_entry;
 	ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc;
 	ppgtt->vm.clear_range = gen8_ppgtt_clear;
 
-- 
2.20.1


* [Intel-gfx] [RFC 2/4] drm/i915/gt: Add a routine to iterate over the pagetables of a GTT
  2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
@ 2020-11-28 18:40 ` Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 3/4] drm/i915/gt: Export the pinned context constructor Chris Wilson
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2020-11-28 18:40 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

In the next patch, we will want to look at the dma addresses of
individual page tables, so add a routine to iterate over them.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
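The shape of the walk, as a toy userspace model (illustrative only; the real
__gen8_ppgtt_foreach() below clips the range with gen8_pd_range(), holds
pd->lock and pins each table via atomic_inc(&pt->used) while recursing):

/* foreach_demo.c - recurse down interior levels, invoke fn on leaf tables */
#include <stdio.h>

#define FANOUT 4	/* the real gen8 tables have 512 entries per level */

struct node {
	struct node *child[FANOUT];	/* NULL where nothing is allocated */
	int id;
};

static void foreach_leaf(struct node *pd, int lvl,
			 void (*fn)(struct node *pt, void *data), void *data)
{
	for (int idx = 0; idx < FANOUT; idx++) {
		struct node *pt = pd->child[idx];

		if (!pt)
			continue;
		if (lvl)
			foreach_leaf(pt, lvl - 1, fn, data);	/* interior directory */
		else
			fn(pt, data);				/* leaf page table */
	}
}

static void count(struct node *pt, void *data)
{
	printf("leaf %d\n", pt->id);
	(*(int *)data)++;
}

int main(void)
{
	struct node leaves[2] = { { .id = 1 }, { .id = 2 } };
	struct node dir = { .child = { &leaves[0], &leaves[1] } };
	struct node top = { .child = { &dir } };
	int n = 0;

	foreach_leaf(&top, 1, count, &n);
	printf("visited %d page tables\n", n);
	return 0;
}

The callback receives each leaf i915_page_table, which is what lets the next
patch read px_dma(pt) for every table in a range.
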
 drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 49 ++++++++++++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_gtt.h  |  7 ++++
 2 files changed, 56 insertions(+)

diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
index 1600a654baa8..abf89b0c19da 100644
--- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
+++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c
@@ -357,6 +357,54 @@ static void gen8_ppgtt_alloc(struct i915_address_space *vm,
 			   &start, start + length, vm->top);
 }
 
+static void __gen8_ppgtt_foreach(struct i915_address_space *vm,
+				 struct i915_page_directory *pd,
+				 u64 *start, u64 end, int lvl,
+				 void (*fn)(struct i915_address_space *vm,
+					    struct i915_page_table *pt,
+					    void *data),
+				 void *data)
+{
+	unsigned int idx, len;
+
+	len = gen8_pd_range(*start, end, lvl--, &idx);
+
+	spin_lock(&pd->lock);
+	do {
+		struct i915_page_table *pt = pd->entry[idx];
+
+		atomic_inc(&pt->used);
+		spin_unlock(&pd->lock);
+
+		if (lvl) {
+			__gen8_ppgtt_foreach(vm, as_pd(pt), start, end, lvl,
+					     fn, data);
+		} else {
+			fn(vm, pt, data);
+			*start += gen8_pt_count(*start, end);
+		}
+
+		spin_lock(&pd->lock);
+		atomic_dec(&pt->used);
+	} while (idx++, --len);
+	spin_unlock(&pd->lock);
+}
+
+static void gen8_ppgtt_foreach(struct i915_address_space *vm,
+			       u64 start, u64 length,
+			       void (*fn)(struct i915_address_space *vm,
+					  struct i915_page_table *pt,
+					  void *data),
+			       void *data)
+{
+	start >>= GEN8_PTE_SHIFT;
+	length >>= GEN8_PTE_SHIFT;
+
+	__gen8_ppgtt_foreach(vm, i915_vm_to_ppgtt(vm)->pd,
+			     &start, start + length, vm->top,
+			     fn, data);
+}
+
 static __always_inline u64
 gen8_ppgtt_insert_pte(struct i915_ppgtt *ppgtt,
 		      struct i915_page_directory *pdp,
@@ -750,6 +798,7 @@ struct i915_ppgtt *gen8_ppgtt_create(struct intel_gt *gt)
 	ppgtt->vm.insert_page = gen8_ppgtt_insert_entry;
 	ppgtt->vm.allocate_va_range = gen8_ppgtt_alloc;
 	ppgtt->vm.clear_range = gen8_ppgtt_clear;
+	ppgtt->vm.foreach = gen8_ppgtt_foreach;
 
 	ppgtt->vm.pte_encode = gen8_pte_encode;
 
diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.h b/drivers/gpu/drm/i915/gt/intel_gtt.h
index 8a33940a71f3..f91ad8442f2e 100644
--- a/drivers/gpu/drm/i915/gt/intel_gtt.h
+++ b/drivers/gpu/drm/i915/gt/intel_gtt.h
@@ -281,6 +281,13 @@ struct i915_address_space {
 			       u32 flags);
 	void (*cleanup)(struct i915_address_space *vm);
 
+	void (*foreach)(struct i915_address_space *vm,
+			u64 start, u64 length,
+			void (*fn)(struct i915_address_space *vm,
+				   struct i915_page_table *pt,
+				   void *data),
+			void *data);
+
 	struct i915_vma_ops vma_ops;
 
 	I915_SELFTEST_DECLARE(struct fault_attr fault_attr);
-- 
2.20.1


* [Intel-gfx] [RFC 3/4] drm/i915/gt: Export the pinned context constructor
  2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 2/4] drm/i915/gt: Add a routine to iterate over the pagetables of a GTT Chris Wilson
@ 2020-11-28 18:40 ` Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration Chris Wilson
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2020-11-28 18:40 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

Allow internal clients to create a pinned context.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
 drivers/gpu/drm/i915/gt/intel_engine.h    |  8 ++++++++
 drivers/gpu/drm/i915/gt/intel_engine_cs.c | 17 ++++++++++-------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index 760fefdfe392..ac58fcda4927 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -19,7 +19,9 @@
 #include "intel_workarounds.h"
 
 struct drm_printer;
+struct intel_context;
 struct intel_gt;
+struct lock_class_key;
 
 /* Early gen2 devices have a cacheline of just 32 bytes, using 64 is overkill,
  * but keeps the logic simple. Indeed, the whole purpose of this macro is just
@@ -336,6 +338,12 @@ struct i915_request *
 intel_engine_find_active_request(struct intel_engine_cs *engine);
 
 u32 intel_engine_context_size(struct intel_gt *gt, u8 class);
+struct intel_context *
+intel_engine_create_pinned_context(struct intel_engine_cs *engine,
+				   unsigned int ring_size,
+				   unsigned int hwsp,
+				   struct lock_class_key *key,
+				   const char *name);
 
 void intel_engine_init_active(struct intel_engine_cs *engine,
 			      unsigned int subclass);
diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
index d4e988b2816a..f37de40f8a4f 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c
+++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c
@@ -796,11 +796,12 @@ intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass)
 #endif
 }
 
-static struct intel_context *
-create_pinned_context(struct intel_engine_cs *engine,
-		      unsigned int hwsp,
-		      struct lock_class_key *key,
-		      const char *name)
+struct intel_context *
+intel_engine_create_pinned_context(struct intel_engine_cs *engine,
+				   unsigned int ring_size,
+				   unsigned int hwsp,
+				   struct lock_class_key *key,
+				   const char *name)
 {
 	struct intel_context *ce;
 	int err;
@@ -811,6 +812,7 @@ create_pinned_context(struct intel_engine_cs *engine,
 
 	__set_bit(CONTEXT_BARRIER_BIT, &ce->flags);
 	ce->timeline = page_pack_bits(NULL, hwsp);
+	ce->ring = __intel_context_ring_size(ring_size);
 
 	err = intel_context_pin(ce); /* perma-pin so it is always available */
 	if (err) {
@@ -834,8 +836,9 @@ create_kernel_context(struct intel_engine_cs *engine)
 {
 	static struct lock_class_key kernel;
 
-	return create_pinned_context(engine, I915_GEM_HWS_SEQNO_ADDR,
-				     &kernel, "kernel_context");
+	return intel_engine_create_pinned_context(engine, SZ_4K,
+						  I915_GEM_HWS_SEQNO_ADDR,
+						  &kernel, "kernel_context");
 }
 
 /**
-- 
2.20.1


* [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 2/4] drm/i915/gt: Add a routine to iterate over the pagetables of a GTT Chris Wilson
  2020-11-28 18:40 ` [Intel-gfx] [RFC 3/4] drm/i915/gt: Export the pinned context constructor Chris Wilson
@ 2020-11-28 18:40 ` Chris Wilson
  2020-11-30 13:12   ` Tvrtko Ursulin
  2020-11-30 15:19 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Patchwork
  2020-11-30 15:51 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  4 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2020-11-28 18:40 UTC (permalink / raw)
  To: intel-gfx; +Cc: Chris Wilson

If we pipeline the PTE updates and then do the copy of those pages
within a single unpreemptible command packet, we can submit the copies
and leave them to be scheduled without having to synchronously wait
under a global lock.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
---
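For reference, the address-space layout that migrate_vm() below constructs,
worked through as a standalone sketch (the window split is from the patch;
everything else here is illustration):

/* migrate_layout_demo.c - the three windows carved out of the migrate ppGTT */
#include <stdio.h>

#define SZ_8M		(8ull << 20)
#define CHUNK_SZ	SZ_8M		/* per-request copy granularity */
#define PAGE_SHIFT	12

int main(void)
{
	unsigned long long sz = 2 * CHUNK_SZ;			/* src + dst windows */
	unsigned long long ptes = (sz >> PAGE_SHIFT) * sizeof(unsigned long long);

	printf("src pages : [0x%09llx, 0x%09llx)\n", 0ull, CHUNK_SZ);
	printf("dst pages : [0x%09llx, 0x%09llx)\n", CHUNK_SZ, sz);
	printf("PTE window: [0x%09llx, 0x%09llx)  (%llu KiB)\n",
	       sz, sz + ptes, ptes >> 10);
	return 0;
}

The third window maps the chunk's own page tables, so each request can rewrite
the PTEs of the first two windows with MI_STORE_DWORD_IMM before the blit.
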
 drivers/gpu/drm/i915/Makefile                 |   1 +
 drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
 drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
 drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
 drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
 .../drm/i915/selftests/i915_live_selftests.h  |   1 +
 6 files changed, 511 insertions(+)
 create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
 create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
 create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c

diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
index e5574e506a5c..0b2e12c87f9d 100644
--- a/drivers/gpu/drm/i915/Makefile
+++ b/drivers/gpu/drm/i915/Makefile
@@ -103,6 +103,7 @@ gt-y += \
 	gt/intel_gtt.o \
 	gt/intel_llc.o \
 	gt/intel_lrc.o \
+	gt/intel_migrate.o \
 	gt/intel_mocs.o \
 	gt/intel_ppgtt.o \
 	gt/intel_rc6.o \
diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
index ac58fcda4927..079d26b47a97 100644
--- a/drivers/gpu/drm/i915/gt/intel_engine.h
+++ b/drivers/gpu/drm/i915/gt/intel_engine.h
@@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
 #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
 #define I915_GEM_HWS_SEQNO		0x40
 #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
+#define I915_GEM_HWS_MIGRATE		(0x42 * sizeof(u32))
 #define I915_GEM_HWS_SCRATCH		0x80
 
 #define I915_HWS_CSB_BUF0_INDEX		0x10
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
new file mode 100644
index 000000000000..4d7bd32eb8d4
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -0,0 +1,370 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#include "i915_drv.h"
+#include "intel_context.h"
+#include "intel_gt.h"
+#include "intel_gtt.h"
+#include "intel_lrc.h" /* virtual engine */
+#include "intel_migrate.h"
+#include "intel_ring.h"
+
+#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
+
+static void insert_pte(struct i915_address_space *vm,
+		       struct i915_page_table *pt,
+		       void *data)
+{
+	u64 *offset = data;
+
+	vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
+	*offset += PAGE_SIZE;
+}
+
+static struct i915_address_space *migrate_vm(struct intel_gt *gt)
+{
+	struct i915_vm_pt_stash stash = {};
+	struct i915_ppgtt *vm;
+	u64 offset, sz;
+	int err;
+
+	vm = i915_ppgtt_create(gt);
+	if (IS_ERR(vm))
+		return ERR_CAST(vm);
+
+	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
+		err = -ENODEV;
+		goto err_vm;
+	}
+
+	/*
+	 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
+	 * 4x2 page directories for source/destination.
+	 */
+	sz = 2 * CHUNK_SZ;
+	offset = sz;
+
+	/*
+	 * We need another page directory setup so that we can write
+	 * the 8x512 PTE in each chunk.
+	 */
+	sz += (sz >> 12) * sizeof(u64);
+
+	err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
+	if (err)
+		goto err_vm;
+
+	err = i915_vm_pin_pt_stash(&vm->vm, &stash);
+	if (err) {
+		i915_vm_free_pt_stash(&vm->vm, &stash);
+		goto err_vm;
+	}
+
+	vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
+	i915_vm_free_pt_stash(&vm->vm, &stash);
+
+	/* Now allow the GPU to rewrite the PTE via its own ppGTT */
+	vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
+
+	return &vm->vm;
+
+err_vm:
+	i915_vm_put(&vm->vm);
+	return ERR_PTR(err);
+}
+
+static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
+{
+	struct intel_engine_cs *engine;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
+		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
+		if (engine)
+			return engine;
+	}
+
+	return NULL;
+}
+
+static struct intel_context *pinned_context(struct intel_gt *gt)
+{
+	static struct lock_class_key key;
+	struct intel_engine_cs *engine;
+	struct i915_address_space *vm;
+	struct intel_context *ce;
+	int err;
+
+	engine = first_copy_engine(gt);
+	if (!engine)
+		return ERR_PTR(-ENODEV);
+
+	ce = intel_engine_create_pinned_context(engine, SZ_512K,
+						I915_GEM_HWS_MIGRATE,
+						&key, "migrate");
+	if (IS_ERR(ce))
+		return ce;
+
+	vm = migrate_vm(gt);
+	if (IS_ERR(vm)) {
+		err = PTR_ERR(vm);
+		goto err_ce;
+	}
+	i915_vm_put(ce->vm);
+	ce->vm = vm;
+
+	return ce;
+
+err_ce:
+	intel_context_put(ce);
+	return ERR_PTR(err);
+}
+
+int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
+{
+	struct intel_context *ce;
+
+	ce = pinned_context(gt);
+	if (IS_ERR(ce))
+		return PTR_ERR(ce);
+
+	m->ce = ce;
+	return 0;
+}
+
+static struct intel_context *__migrate_engines(struct intel_gt *gt)
+{
+	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
+	struct intel_engine_cs *engine;
+	unsigned int count, i;
+
+	count = 0;
+	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
+		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
+		if (engine)
+			engines[count++] = engine;
+	}
+
+	return intel_execlists_create_virtual(engines, count);
+}
+
+struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
+{
+	struct intel_context *ce;
+
+	ce = __migrate_engines(m->ce->engine->gt);
+	if (IS_ERR(ce))
+		return ce;
+
+	ce->ring = __intel_context_ring_size(SZ_512K);
+
+	i915_vm_put(ce->vm);
+	ce->vm = i915_vm_get(m->ce->vm);
+
+	return ce;
+}
+
+static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
+{
+	dma_addr_t addr = sg_dma_address(sg);
+
+	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
+}
+
+static int emit_pte(struct i915_request *rq,
+		    struct sgt_dma *it,
+		    u64 encode,
+		    int offset,
+		    int length)
+{
+	int total = 0;
+
+	offset >>= 12;
+	offset *= sizeof(u64);
+	offset += 2 * CHUNK_SZ;
+
+	do {
+		u32 *cs;
+
+		cs = intel_ring_begin(rq, 8);
+		if (IS_ERR(cs))
+			return PTR_ERR(cs);
+
+		*cs++ = MI_STORE_DWORD_IMM_GEN4;
+		*cs++ = offset;
+		*cs++ = 0;
+		*cs++ = lower_32_bits(encode | it->dma);
+		*cs++ = MI_STORE_DWORD_IMM_GEN4;
+		*cs++ = offset + 4;
+		*cs++ = 0;
+		*cs++ = upper_32_bits(encode | it->dma);
+		intel_ring_advance(rq, cs);
+
+		offset += 8;
+		total += I915_GTT_PAGE_SIZE;
+
+		it->dma += I915_GTT_PAGE_SIZE;
+		if (it->dma >= it->max) {
+			it->sg = __sg_next(it->sg);
+			if (!it->sg || sg_dma_len(it->sg) == 0)
+				break;
+
+			it->dma = sg_dma_address(it->sg);
+			it->max = it->dma + sg_dma_len(it->sg);
+		}
+
+		if (total == length)
+			break;
+	} while (1);
+
+	return total;
+}
+
+static bool wa_1209644611_applies(int gen, u32 size)
+{
+	u32 height = size >> PAGE_SHIFT;
+
+	if (gen != 11)
+		return false;
+
+	return height % 4 == 3 && height <= 8;
+}
+
+static int emit_copy(struct i915_request *rq, int size)
+{
+	const int gen = INTEL_GEN(rq->engine->i915);
+	u32 *cs;
+
+	cs = intel_ring_begin(rq, gen >= 8 ? 10 : 6);
+	if (IS_ERR(cs))
+		return PTR_ERR(cs);
+
+	if (gen >= 9 && !wa_1209644611_applies(gen, size)) {
+		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
+		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+		*cs++ = 0;
+	} else if (gen >= 8) {
+		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = 0;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = 0;
+		*cs++ = 0;
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+		*cs++ = 0;
+	} else {
+		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
+		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
+		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
+		*cs++ = CHUNK_SZ; /* dst offset */
+		*cs++ = PAGE_SIZE;
+		*cs++ = 0; /* src offset */
+	}
+
+	intel_ring_advance(rq, cs);
+	return 0;
+}
+
+struct i915_request *
+intel_context_migrate_pages(struct intel_context *ce,
+			    struct scatterlist *src,
+			    struct scatterlist *dst)
+{
+	struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
+	u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
+	struct i915_request *rq;
+	int len;
+	int err;
+
+	/* GEM_BUG_ON(ce->vm != migrate_vm); */
+
+	err = intel_context_pin(ce);
+	if (err)
+		return ERR_PTR(err);
+
+	GEM_BUG_ON(ce->ring->size < SZ_64K);
+
+	do {
+		rq = i915_request_create(ce);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto out_ce;
+		}
+
+		len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
+		if (len <= 0) {
+			err = len;
+			goto out_rq;
+		}
+
+		if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
+			err = -EINVAL;
+			goto out_rq;
+		}
+
+		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
+		if (err)
+			goto out_rq;
+
+		err = emit_copy(rq, len);
+		if (err)
+			goto out_rq;
+
+		if (!it_s.sg)
+			i915_request_get(rq);
+out_rq:
+		i915_request_add(rq);
+		if (it_s.sg)
+			cond_resched();
+	} while (err == 0 && it_s.sg);
+
+out_ce:
+	intel_context_unpin(ce);
+	return err ? ERR_PTR(err) : rq;
+}
+
+struct i915_request *
+intel_migrate_pages(struct intel_migrate *m,
+		    struct scatterlist *src,
+		    struct scatterlist *dst)
+{
+	struct intel_context *ce;
+	struct i915_request *rq;
+
+	if (!m->ce)
+		return ERR_PTR(-ENODEV);
+
+	ce = intel_migrate_create_context(m);
+	if (IS_ERR(ce))
+		ce = intel_context_get(m->ce);
+	GEM_BUG_ON(IS_ERR(ce));
+
+	rq = intel_context_migrate_pages(ce, src, dst);
+
+	intel_context_put(ce);
+	return rq;
+}
+
+void intel_migrate_fini(struct intel_migrate *m)
+{
+	if (!m->ce)
+		return;
+
+	intel_context_unpin(m->ce);
+	intel_context_put(m->ce);
+}
+
+#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
+#include "selftest_migrate.c"
+#endif
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.h b/drivers/gpu/drm/i915/gt/intel_migrate.h
new file mode 100644
index 000000000000..8c3c446fbd33
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: MIT */
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+#ifndef __INTEL_MIGRATE__
+#define __INTEL_MIGRATE__
+
+struct i915_request;
+struct intel_context;
+struct intel_gt;
+struct scatterlist;
+
+struct intel_migrate {
+	struct intel_context *ce;
+};
+
+int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt);
+
+struct i915_request *
+intel_migrate_pages(struct intel_migrate *m,
+		    struct scatterlist *src,
+		    struct scatterlist *dst);
+
+struct intel_context *intel_migrate_create_context(struct intel_migrate *m);
+struct i915_request *
+intel_context_migrate_pages(struct intel_context *ce,
+			    struct scatterlist *src,
+			    struct scatterlist *dst);
+
+void intel_migrate_fini(struct intel_migrate *m);
+
+#endif /* __INTEL_MIGRATE__ */
diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
new file mode 100644
index 000000000000..d5102058fe3b
--- /dev/null
+++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2020 Intel Corporation
+ */
+
+static int live_migrate(void *arg)
+{
+	struct intel_migrate *m = arg;
+	struct drm_i915_private *i915 = m->ce->engine->i915;
+	const unsigned int sizes[] = {
+		SZ_4K,
+		SZ_64K,
+		SZ_2M,
+		SZ_64M,
+		//SZ_2G,
+	};
+	int err = 0;
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
+		struct drm_i915_gem_object *src, *dst;
+		struct i915_request *rq;
+		u32 *vaddr;
+
+		src = i915_gem_object_create_internal(i915, sizes[i]);
+		if (IS_ERR(src))
+			break;
+
+		vaddr = i915_gem_object_pin_map(src, I915_MAP_WC);
+		if (IS_ERR(vaddr)) {
+			i915_gem_object_put(src);
+			break;
+		}
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++)
+			vaddr[j] = j;
+		i915_gem_object_flush_map(src);
+
+		dst = i915_gem_object_create_internal(i915, sizes[i]);
+		if (IS_ERR(dst)) {
+			i915_gem_object_put(src);
+			break;
+		}
+
+		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WC);
+		if (IS_ERR(vaddr)) {
+			i915_gem_object_put(dst);
+			i915_gem_object_put(src);
+			break;
+		}
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++)
+			vaddr[j] = ~j;
+		i915_gem_object_flush_map(dst);
+
+		rq = intel_migrate_pages(m,
+					 src->mm.pages->sgl,
+					 dst->mm.pages->sgl);
+		if (IS_ERR(rq)) {
+			pr_err("Migration failed, size: %u\n", sizes[i]);
+			err = PTR_ERR(rq);
+		}
+
+		if (i915_request_wait(rq, 0, HZ) < 0) {
+			pr_err("Migration timed out, size: %u\n", sizes[i]);
+			err = -ETIME;
+		}
+		i915_request_put(rq);
+
+		for (j = 0; j < sizes[i] / sizeof(u32); j++) {
+			if (vaddr[j] != j) {
+				pr_err("Copy failed, size: %u, offset: %zu\n",
+				       sizes[i], j * sizeof(u32));
+				igt_hexdump(vaddr + round_down(j, 1024), 4096);
+				err = -EINVAL;
+				break;
+			}
+		}
+
+		i915_gem_object_put(dst);
+		i915_gem_object_put(src);
+		i915_gem_drain_freed_objects(i915);
+		if (err)
+			break;
+	}
+
+	return err;
+}
+
+int intel_migrate_live_selftests(struct drm_i915_private *i915)
+{
+	static const struct i915_subtest tests[] = {
+		SUBTEST(live_migrate),
+	};
+	struct intel_migrate m;
+	int err;
+
+	if (intel_migrate_init(&m, &i915->gt))
+		return 0;
+
+	err = i915_subtests(tests, &m);
+	intel_migrate_fini(&m);
+
+	return err;
+}
diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
index a92c0e9b7e6b..be5e0191eaea 100644
--- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
+++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
@@ -26,6 +26,7 @@ selftest(gt_mocs, intel_mocs_live_selftests)
 selftest(gt_pm, intel_gt_pm_live_selftests)
 selftest(gt_heartbeat, intel_heartbeat_live_selftests)
 selftest(requests, i915_request_live_selftests)
+selftest(migrate, intel_migrate_live_selftests)
 selftest(active, i915_active_live_selftests)
 selftest(objects, i915_gem_object_live_selftests)
 selftest(mman, i915_gem_mman_live_selftests)
-- 
2.20.1


* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-28 18:40 ` [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration Chris Wilson
@ 2020-11-30 13:12   ` Tvrtko Ursulin
  2020-11-30 13:39     ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Tvrtko Ursulin @ 2020-11-30 13:12 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 28/11/2020 18:40, Chris Wilson wrote:
> If we pipeline the PTE updates and then do the copy of those pages
> within a single unpreemptible command packet, we can submit the copies
> and leave them to be scheduled without having to synchronously wait
> under a global lock.
> 
> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> ---
>   drivers/gpu/drm/i915/Makefile                 |   1 +
>   drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
>   drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
>   drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
>   drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
>   .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>   6 files changed, 511 insertions(+)
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
>   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
>   create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index e5574e506a5c..0b2e12c87f9d 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -103,6 +103,7 @@ gt-y += \
>   	gt/intel_gtt.o \
>   	gt/intel_llc.o \
>   	gt/intel_lrc.o \
> +	gt/intel_migrate.o \
>   	gt/intel_mocs.o \
>   	gt/intel_ppgtt.o \
>   	gt/intel_rc6.o \
> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> index ac58fcda4927..079d26b47a97 100644
> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>   #define I915_GEM_HWS_PREEMPT_ADDR	(I915_GEM_HWS_PREEMPT * sizeof(u32))
>   #define I915_GEM_HWS_SEQNO		0x40
>   #define I915_GEM_HWS_SEQNO_ADDR		(I915_GEM_HWS_SEQNO * sizeof(u32))
> +#define I915_GEM_HWS_MIGRATE		(0x42 * sizeof(u32))
>   #define I915_GEM_HWS_SCRATCH		0x80
>   
>   #define I915_HWS_CSB_BUF0_INDEX		0x10
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> new file mode 100644
> index 000000000000..4d7bd32eb8d4
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> @@ -0,0 +1,370 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +#include "i915_drv.h"
> +#include "intel_context.h"
> +#include "intel_gt.h"
> +#include "intel_gtt.h"
> +#include "intel_lrc.h" /* virtual engine */
> +#include "intel_migrate.h"
> +#include "intel_ring.h"
> +
> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> +
> +static void insert_pte(struct i915_address_space *vm,
> +		       struct i915_page_table *pt,
> +		       void *data)
> +{
> +	u64 *offset = data;
> +
> +	vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
> +	*offset += PAGE_SIZE;
> +}
> +
> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> +{
> +	struct i915_vm_pt_stash stash = {};
> +	struct i915_ppgtt *vm;
> +	u64 offset, sz;
> +	int err;
> +
> +	vm = i915_ppgtt_create(gt);
> +	if (IS_ERR(vm))
> +		return ERR_CAST(vm);
> +
> +	if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> +		err = -ENODEV;
> +		goto err_vm;
> +	}
> +
> +	/*
> +	 * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> +	 * 4x2 page directories for source/destination.
> +	 */
> +	sz = 2 * CHUNK_SZ;
> +	offset = sz;
> +
> +	/*
> +	 * We need another page directory setup so that we can write
> +	 * the 8x512 PTE in each chunk.
> +	 */
> +	sz += (sz >> 12) * sizeof(u64);
> +
> +	err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> +	if (err)
> +		goto err_vm;
> +
> +	err = i915_vm_pin_pt_stash(&vm->vm, &stash);
> +	if (err) {
> +		i915_vm_free_pt_stash(&vm->vm, &stash);
> +		goto err_vm;
> +	}
> +
> +	vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
> +	i915_vm_free_pt_stash(&vm->vm, &stash);
> +
> +	/* Now allow the GPU to rewrite the PTE via its own ppGTT */
> +	vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);

This is just making the [0, sz) GVA range point to the allocated sz bytes of 
backing store?

> +
> +	return &vm->vm;
> +
> +err_vm:
> +	i915_vm_put(&vm->vm);
> +	return ERR_PTR(err);
> +}
> +
> +static struct intel_engine_cs *first_copy_engine(struct intel_gt *gt)
> +{
> +	struct intel_engine_cs *engine;
> +	int i;
> +
> +	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
> +		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
> +		if (engine)
> +			return engine;
> +	}
> +
> +	return NULL;
> +}
> +
> +static struct intel_context *pinned_context(struct intel_gt *gt)
> +{
> +	static struct lock_class_key key;
> +	struct intel_engine_cs *engine;
> +	struct i915_address_space *vm;
> +	struct intel_context *ce;
> +	int err;
> +
> +	engine = first_copy_engine(gt);
> +	if (!engine)
> +		return ERR_PTR(-ENODEV);
> +
> +	ce = intel_engine_create_pinned_context(engine, SZ_512K,
> +						I915_GEM_HWS_MIGRATE,
> +						&key, "migrate");
> +	if (IS_ERR(ce))
> +		return ce;
> +
> +	vm = migrate_vm(gt);
> +	if (IS_ERR(vm)) {
> +		err = PTR_ERR(vm);
> +		goto err_ce;
> +	}
> +	i915_vm_put(ce->vm);
> +	ce->vm = vm;
> +
> +	return ce;
> +
> +err_ce:
> +	intel_context_put(ce);
> +	return ERR_PTR(err);
> +}
> +
> +int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt)
> +{
> +	struct intel_context *ce;
> +
> +	ce = pinned_context(gt);
> +	if (IS_ERR(ce))
> +		return PTR_ERR(ce);
> +
> +	m->ce = ce;
> +	return 0;
> +}
> +
> +static struct intel_context *__migrate_engines(struct intel_gt *gt)
> +{
> +	struct intel_engine_cs *engines[MAX_ENGINE_INSTANCE];
> +	struct intel_engine_cs *engine;
> +	unsigned int count, i;
> +
> +	count = 0;
> +	for (i = 0; i < ARRAY_SIZE(gt->engine_class[COPY_ENGINE_CLASS]); i++) {
> +		engine = gt->engine_class[COPY_ENGINE_CLASS][i];
> +		if (engine)
> +			engines[count++] = engine;
> +	}
> +
> +	return intel_execlists_create_virtual(engines, count);
> +}
> +
> +struct intel_context *intel_migrate_create_context(struct intel_migrate *m)
> +{
> +	struct intel_context *ce;
> +
> +	ce = __migrate_engines(m->ce->engine->gt);
> +	if (IS_ERR(ce))
> +		return ce;
> +
> +	ce->ring = __intel_context_ring_size(SZ_512K);
> +
> +	i915_vm_put(ce->vm);
> +	ce->vm = i915_vm_get(m->ce->vm);
> +
> +	return ce;
> +}
> +
> +static inline struct sgt_dma sg_sgt(struct scatterlist *sg)
> +{
> +	dma_addr_t addr = sg_dma_address(sg);
> +
> +	return (struct sgt_dma){ sg, addr, addr + sg_dma_len(sg) };
> +}
> +
> +static int emit_pte(struct i915_request *rq,
> +		    struct sgt_dma *it,
> +		    u64 encode,
> +		    int offset,
> +		    int length)
> +{
> +	int total = 0;
> +
> +	offset >>= 12;
> +	offset *= sizeof(u64);
> +	offset += 2 * CHUNK_SZ;
> +
> +	do {
> +		u32 *cs;
> +
> +		cs = intel_ring_begin(rq, 8);
> +		if (IS_ERR(cs))
> +			return PTR_ERR(cs);
> +
> +		*cs++ = MI_STORE_DWORD_IMM_GEN4;
> +		*cs++ = offset;
> +		*cs++ = 0;
> +		*cs++ = lower_32_bits(encode | it->dma);
> +		*cs++ = MI_STORE_DWORD_IMM_GEN4;
> +		*cs++ = offset + 4;
> +		*cs++ = 0;
> +		*cs++ = upper_32_bits(encode | it->dma);
> +		intel_ring_advance(rq, cs);
> +
> +		offset += 8;
> +		total += I915_GTT_PAGE_SIZE;
> +
> +		it->dma += I915_GTT_PAGE_SIZE;
> +		if (it->dma >= it->max) {
> +			it->sg = __sg_next(it->sg);
> +			if (!it->sg || sg_dma_len(it->sg) == 0)
> +				break;
> +
> +			it->dma = sg_dma_address(it->sg);
> +			it->max = it->dma + sg_dma_len(it->sg);
> +		}
> +
> +		if (total == length)
> +			break;
> +	} while (1);
> +
> +	return total;
> +}
> +
> +static bool wa_1209644611_applies(int gen, u32 size)
> +{
> +	u32 height = size >> PAGE_SHIFT;
> +
> +	if (gen != 11)
> +		return false;
> +
> +	return height % 4 == 3 && height <= 8;
> +}
> +
> +static int emit_copy(struct i915_request *rq, int size)
> +{
> +	const int gen = INTEL_GEN(rq->engine->i915);
> +	u32 *cs;
> +
> +	cs = intel_ring_begin(rq, gen >= 8 ? 10 : 6);
> +	if (IS_ERR(cs))
> +		return PTR_ERR(cs);
> +
> +	if (gen >= 9 && !wa_1209644611_applies(gen, size)) {
> +		*cs++ = GEN9_XY_FAST_COPY_BLT_CMD | (10 - 2);
> +		*cs++ = BLT_DEPTH_32 | PAGE_SIZE;
> +		*cs++ = 0;
> +		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +		*cs++ = CHUNK_SZ; /* dst offset */
> +		*cs++ = 0;
> +		*cs++ = 0;
> +		*cs++ = PAGE_SIZE;
> +		*cs++ = 0; /* src offset */
> +		*cs++ = 0;
> +	} else if (gen >= 8) {
> +		*cs++ = XY_SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (10 - 2);
> +		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
> +		*cs++ = 0;
> +		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE / 4;
> +		*cs++ = CHUNK_SZ; /* dst offset */
> +		*cs++ = 0;
> +		*cs++ = 0;
> +		*cs++ = PAGE_SIZE;
> +		*cs++ = 0; /* src offset */
> +		*cs++ = 0;
> +	} else {
> +		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
> +		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | PAGE_SIZE;
> +		*cs++ = size >> PAGE_SHIFT << 16 | PAGE_SIZE;
> +		*cs++ = CHUNK_SZ; /* dst offset */
> +		*cs++ = PAGE_SIZE;
> +		*cs++ = 0; /* src offset */
> +	}
> +
> +	intel_ring_advance(rq, cs);
> +	return 0;
> +}
> +
> +struct i915_request *
> +intel_context_migrate_pages(struct intel_context *ce,
> +			    struct scatterlist *src,
> +			    struct scatterlist *dst)
> +{
> +	struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
> +	u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
> +	struct i915_request *rq;
> +	int len;
> +	int err;
> +
> +	/* GEM_BUG_ON(ce->vm != migrate_vm); */
> +
> +	err = intel_context_pin(ce);
> +	if (err)
> +		return ERR_PTR(err);
> +
> +	GEM_BUG_ON(ce->ring->size < SZ_64K);
> +
> +	do {
> +		rq = i915_request_create(ce);
> +		if (IS_ERR(rq)) {
> +			err = PTR_ERR(rq);
> +			goto out_ce;
> +		}
> +
> +		len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
> +		if (len <= 0) {
> +			err = len;
> +			goto out_rq;
> +		}
> +
> +		if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
> +			err = -EINVAL;
> +			goto out_rq;
> +		}

Source and destination PTEs into the reserved [0, sz * 2) area?

> +
> +		err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> +		if (err)
> +			goto out_rq;
> +
> +		err = emit_copy(rq, len);

Right so copy can use fixed offsets.

> +		if (err)
> +			goto out_rq;
> +
> +		if (!it_s.sg)
> +			i915_request_get(rq);
> +out_rq:
> +		i915_request_add(rq);
> +		if (it_s.sg)
> +			cond_resched();

From what context does this run? Not preemptible?

> +	} while (err == 0 && it_s.sg);
> +
> +out_ce:
> +	intel_context_unpin(ce);
> +	return err ? ERR_PTR(err) : rq;
> +}
> +
> +struct i915_request *
> +intel_migrate_pages(struct intel_migrate *m,
> +		    struct scatterlist *src,
> +		    struct scatterlist *dst)
> +{
> +	struct intel_context *ce;
> +	struct i915_request *rq;
> +
> +	if (!m->ce)
> +		return ERR_PTR(-ENODEV);
> +
> +	ce = intel_migrate_create_context(m);
> +	if (IS_ERR(ce))
> +		ce = intel_context_get(m->ce);

If the virtual engine cannot be created, use a common pre-created context?

> +	GEM_BUG_ON(IS_ERR(ce));
> +
> +	rq = intel_context_migrate_pages(ce, src, dst);
> +
> +	intel_context_put(ce);

Context is single use for some concrete reason? But it has a fallback to a 
single context, so I am not sure. Is the plan to allow using the user's 
context, or to inherit their priority, and is that the reason?

...

So I guess overall this is an alternative to fixed vma windows but can 
be pipelined.

I did not get the foreach part. Why do you have to iterate existing 
entries to add entries? Can't you just populate the reserved range from 
the stashed bo dma addresses?

Regards,

Tvrtko

> +	return rq;
> +}
> +
> +void intel_migrate_fini(struct intel_migrate *m)
> +{
> +	if (!m->ce)
> +		return;
> +
> +	intel_context_unpin(m->ce);
> +	intel_context_put(m->ce);
> +}
> +
> +#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
> +#include "selftest_migrate.c"
> +#endif
> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.h b/drivers/gpu/drm/i915/gt/intel_migrate.h
> new file mode 100644
> index 000000000000..8c3c446fbd33
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.h
> @@ -0,0 +1,33 @@
> +/* SPDX-License-Identifier: MIT */
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +#ifndef __INTEL_MIGRATE__
> +#define __INTEL_MIGRATE__
> +
> +struct i915_request;
> +struct intel_context;
> +struct intel_gt;
> +struct scatterlist;
> +
> +struct intel_migrate {
> +	struct intel_context *ce;
> +};
> +
> +int intel_migrate_init(struct intel_migrate *m, struct intel_gt *gt);
> +
> +struct i915_request *
> +intel_migrate_pages(struct intel_migrate *m,
> +		    struct scatterlist *src,
> +		    struct scatterlist *dst);
> +
> +struct intel_context *intel_migrate_create_context(struct intel_migrate *m);
> +struct i915_request *
> +intel_context_migrate_pages(struct intel_context *ce,
> +			    struct scatterlist *src,
> +			    struct scatterlist *dst);
> +
> +void intel_migrate_fini(struct intel_migrate *m);
> +
> +#endif /* __INTEL_MIGRATE__ */
> diff --git a/drivers/gpu/drm/i915/gt/selftest_migrate.c b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> new file mode 100644
> index 000000000000..d5102058fe3b
> --- /dev/null
> +++ b/drivers/gpu/drm/i915/gt/selftest_migrate.c
> @@ -0,0 +1,105 @@
> +// SPDX-License-Identifier: MIT
> +/*
> + * Copyright © 2020 Intel Corporation
> + */
> +
> +static int live_migrate(void *arg)
> +{
> +	struct intel_migrate *m = arg;
> +	struct drm_i915_private *i915 = m->ce->engine->i915;
> +	const unsigned int sizes[] = {
> +		SZ_4K,
> +		SZ_64K,
> +		SZ_2M,
> +		SZ_64M,
> +		//SZ_2G,
> +	};
> +	int err = 0;
> +	int i, j;
> +
> +	for (i = 0; i < ARRAY_SIZE(sizes); i++) {
> +		struct drm_i915_gem_object *src, *dst;
> +		struct i915_request *rq;
> +		u32 *vaddr;
> +
> +		src = i915_gem_object_create_internal(i915, sizes[i]);
> +		if (IS_ERR(src))
> +			break;
> +
> +		vaddr = i915_gem_object_pin_map(src, I915_MAP_WC);
> +		if (IS_ERR(vaddr)) {
> +			i915_gem_object_put(src);
> +			break;
> +		}
> +
> +		for (j = 0; j < sizes[i] / sizeof(u32); j++)
> +			vaddr[j] = j;
> +		i915_gem_object_flush_map(src);
> +
> +		dst = i915_gem_object_create_internal(i915, sizes[i]);
> +		if (IS_ERR(dst)) {
> +			i915_gem_object_put(src);
> +			break;
> +		}
> +
> +		vaddr = i915_gem_object_pin_map(dst, I915_MAP_WC);
> +		if (IS_ERR(vaddr)) {
> +			i915_gem_object_put(dst);
> +			i915_gem_object_put(src);
> +			break;
> +		}
> +
> +		for (j = 0; j < sizes[i] / sizeof(u32); j++)
> +			vaddr[j] = ~j;
> +		i915_gem_object_flush_map(dst);
> +
> +		rq = intel_migrate_pages(m,
> +					 src->mm.pages->sgl,
> +					 dst->mm.pages->sgl);
> +		if (IS_ERR(rq)) {
> +			pr_err("Migration failed, size: %u\n", sizes[i]);
> +			err = PTR_ERR(rq);
> +		}
> +
> +		if (i915_request_wait(rq, 0, HZ) < 0) {
> +			pr_err("Migration timed out, size: %u\n", sizes[i]);
> +			err = -ETIME;
> +		}
> +		i915_request_put(rq);
> +
> +		for (j = 0; j < sizes[i] / sizeof(u32); j++) {
> +			if (vaddr[j] != j) {
> +				pr_err("Copy failed, size: %u, offset: %zu\n",
> +				       sizes[i], j * sizeof(u32));
> +				igt_hexdump(vaddr + round_down(j, 1024), 4096);
> +				err = -EINVAL;
> +				break;
> +			}
> +		}
> +
> +		i915_gem_object_put(dst);
> +		i915_gem_object_put(src);
> +		i915_gem_drain_freed_objects(i915);
> +		if (err)
> +			break;
> +	}
> +
> +	return err;
> +}
> +
> +int intel_migrate_live_selftests(struct drm_i915_private *i915)
> +{
> +	static const struct i915_subtest tests[] = {
> +		SUBTEST(live_migrate),
> +	};
> +	struct intel_migrate m;
> +	int err;
> +
> +	if (intel_migrate_init(&m, &i915->gt))
> +		return 0;
> +
> +	err = i915_subtests(tests, &m);
> +	intel_migrate_fini(&m);
> +
> +	return err;
> +}
> diff --git a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> index a92c0e9b7e6b..be5e0191eaea 100644
> --- a/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> +++ b/drivers/gpu/drm/i915/selftests/i915_live_selftests.h
> @@ -26,6 +26,7 @@ selftest(gt_mocs, intel_mocs_live_selftests)
>   selftest(gt_pm, intel_gt_pm_live_selftests)
>   selftest(gt_heartbeat, intel_heartbeat_live_selftests)
>   selftest(requests, i915_request_live_selftests)
> +selftest(migrate, intel_migrate_live_selftests)
>   selftest(active, i915_active_live_selftests)
>   selftest(objects, i915_gem_object_live_selftests)
>   selftest(mman, i915_gem_mman_live_selftests)
> 

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 13:12   ` Tvrtko Ursulin
@ 2020-11-30 13:39     ` Chris Wilson
  2020-11-30 14:11       ` Chris Wilson
  2020-11-30 16:07       ` Tvrtko Ursulin
  0 siblings, 2 replies; 16+ messages in thread
From: Chris Wilson @ 2020-11-30 13:39 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
> 
> On 28/11/2020 18:40, Chris Wilson wrote:
> > If we pipeline the PTE updates and then do the copy of those pages
> > within a single unpreemptible command packet, we can submit the copies
> > and leave them to be scheduled without having to synchronously wait
> > under a global lock.
> > 
> > Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> > ---
> >   drivers/gpu/drm/i915/Makefile                 |   1 +
> >   drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
> >   drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
> >   drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
> >   drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
> >   .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >   6 files changed, 511 insertions(+)
> >   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
> >   create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
> >   create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> > 
> > diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> > index e5574e506a5c..0b2e12c87f9d 100644
> > --- a/drivers/gpu/drm/i915/Makefile
> > +++ b/drivers/gpu/drm/i915/Makefile
> > @@ -103,6 +103,7 @@ gt-y += \
> >       gt/intel_gtt.o \
> >       gt/intel_llc.o \
> >       gt/intel_lrc.o \
> > +     gt/intel_migrate.o \
> >       gt/intel_mocs.o \
> >       gt/intel_ppgtt.o \
> >       gt/intel_rc6.o \
> > diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> > index ac58fcda4927..079d26b47a97 100644
> > --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> > +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> > @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
> >   #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
> >   #define I915_GEM_HWS_SEQNO          0x40
> >   #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
> > +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
> >   #define I915_GEM_HWS_SCRATCH                0x80
> >   
> >   #define I915_HWS_CSB_BUF0_INDEX             0x10
> > diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > new file mode 100644
> > index 000000000000..4d7bd32eb8d4
> > --- /dev/null
> > +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> > @@ -0,0 +1,370 @@
> > +// SPDX-License-Identifier: MIT
> > +/*
> > + * Copyright © 2020 Intel Corporation
> > + */
> > +
> > +#include "i915_drv.h"
> > +#include "intel_context.h"
> > +#include "intel_gt.h"
> > +#include "intel_gtt.h"
> > +#include "intel_lrc.h" /* virtual engine */
> > +#include "intel_migrate.h"
> > +#include "intel_ring.h"
> > +
> > +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> > +
> > +static void insert_pte(struct i915_address_space *vm,
> > +                    struct i915_page_table *pt,
> > +                    void *data)
> > +{
> > +     u64 *offset = data;
> > +
> > +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
> > +     *offset += PAGE_SIZE;
> > +}
> > +
> > +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> > +{
> > +     struct i915_vm_pt_stash stash = {};
> > +     struct i915_ppgtt *vm;
> > +     u64 offset, sz;
> > +     int err;
> > +
> > +     vm = i915_ppgtt_create(gt);
> > +     if (IS_ERR(vm))
> > +             return ERR_CAST(vm);
> > +
> > +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> > +             err = -ENODEV;
> > +             goto err_vm;
> > +     }
> > +
> > +     /*
> > +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> > +      * 4x2 page directories for source/destination.
> > +      */
> > +     sz = 2 * CHUNK_SZ;
> > +     offset = sz;
> > +
> > +     /*
> > +      * We need another page directory setup so that we can write
> > +      * the 8x512 PTE in each chunk.
> > +      */
> > +     sz += (sz >> 12) * sizeof(u64);
> > +
> > +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> > +     if (err)
> > +             goto err_vm;
> > +
> > +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
> > +     if (err) {
> > +             i915_vm_free_pt_stash(&vm->vm, &stash);
> > +             goto err_vm;
> > +     }
> > +
> > +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
> > +     i915_vm_free_pt_stash(&vm->vm, &stash);
> > +
> > +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
> > +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
> 
> This is just making the [0, sz) GVA range point to the allocated sz bytes of 
> backing store?

Not quite, we are pointing [offset, sz) to the page directories
themselves. When we write into that range, we are modifying the PTE of
this ppGTT. (Breaking the fourth wall, allowing the non-privileged
context to update its own page tables.)
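
A rough sketch of where those self-referential PTEs end up, following the
offset arithmetic in emit_pte() (userspace model; only CHUNK_SZ is taken from
the patch, the helper is purely illustrative):

/* pte_window_demo.c - GTT address of the PTE for page n of a copy window */
#include <stdio.h>

#define CHUNK_SZ (8ull << 20)

static unsigned long long pte_addr(unsigned long long window, unsigned long long n)
{
	unsigned long long page = window + (n << 12);	/* page address in the window */

	return 2 * CHUNK_SZ + (page >> 12) * sizeof(unsigned long long);
}

int main(void)
{
	printf("src pte[0] at 0x%llx\n", pte_addr(0, 0));		/* 16MiB */
	printf("dst pte[0] at 0x%llx\n", pte_addr(CHUNK_SZ, 0));	/* 16MiB + 16KiB */
	return 0;
}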

This vm could be engine->vm, but I don't think we actually save anything
by keeping them independent, and we gain the safety of not futzing with
the kernel_context.

Remind me that kernel_vm can share their scratches, even when read-only
is not supported.

> 
> > +
> > +     return &vm->vm;
> > +
> > +err_vm:
> > +     i915_vm_put(&vm->vm);
> > +     return ERR_PTR(err);
> > +}
> > +
> > +struct i915_request *
> > +intel_context_migrate_pages(struct intel_context *ce,
> > +                         struct scatterlist *src,
> > +                         struct scatterlist *dst)
> > +{
> > +     struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
> > +     u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
> > +     struct i915_request *rq;
> > +     int len;
> > +     int err;
> > +
> > +     /* GEM_BUG_ON(ce->vm != migrate_vm); */
> > +
> > +     err = intel_context_pin(ce);
> > +     if (err)
> > +             return ERR_PTR(err);
> > +
> > +     GEM_BUG_ON(ce->ring->size < SZ_64K);
> > +
> > +     do {
> > +             rq = i915_request_create(ce);
> > +             if (IS_ERR(rq)) {
> > +                     err = PTR_ERR(rq);
> > +                     goto out_ce;
> > +             }
> > +
> > +             len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
> > +             if (len <= 0) {
> > +                     err = len;
> > +                     goto out_rq;
> > +             }
> > +
> > +             if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
> > +                     err = -EINVAL;
> > +                     goto out_rq;
> > +             }
> 
> Source and destination PTEs into the reserved [0, sz * 2) area?

Yes.

> 
> > +
> > +             err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> > +             if (err)
> > +                     goto out_rq;
> > +
> > +             err = emit_copy(rq, len);
> 
> Right so copy can use fixed offsets.
> 
> > +             if (err)
> > +                     goto out_rq;
> > +
> > +             if (!it_s.sg)
> > +                     i915_request_get(rq);
> > +out_rq:
> > +             i915_request_add(rq);
> > +             if (it_s.sg)
> > +                     cond_resched();
> 
> From what context does this run? Not preemptible?

Has to be process context; numerous allocations, implicit waits (that we
want to avoid in practice), and the timeline (per-context) mutex to
guard access to the ringbuffer.

> > +     } while (err == 0 && it_s.sg);
> > +
> > +out_ce:
> > +     intel_context_unpin(ce);
> > +     return err ? ERR_PTR(err) : rq;
> > +}
> > +
> > +struct i915_request *
> > +intel_migrate_pages(struct intel_migrate *m,
> > +                 struct scatterlist *src,
> > +                 struct scatterlist *dst)
> > +{
> > +     struct intel_context *ce;
> > +     struct i915_request *rq;
> > +
> > +     if (!m->ce)
> > +             return ERR_PTR(-ENODEV);
> > +
> > +     ce = intel_migrate_create_context(m);
> > +     if (IS_ERR(ce))
> > +             ce = intel_context_get(m->ce);
> 
> If the virtual engine cannot be created, use a common pre-created context?

Yes, I also had the try-to-pin with a fallback to a single perma-pinned
context at one point. There's a balance to be found between automagic
failure handling and error propagation.

> > +     GEM_BUG_ON(IS_ERR(ce));
> > +
> > +     rq = intel_context_migrate_pages(ce, src, dst);
> > +
> > +     intel_context_put(ce);
> 
> Context is single use for some concrete reason?

The intended use case is for each client to have their own context; I
was thinking along the lines of gem_context.migrate, with the global one
kept for use under pressure and for cases where we cannot link back to a user.

As for single-use, once upon a time I expected that we would have a
pool of contexts for reuse. At the moment, I like the idea of gem_context.migrate
more, but it all depends on getting contexts down into the get_pages.

Been there, done that. :|

> But it has a fallback to a 
> single context, so I am not sure. Is the plan to allow using the user's 
> context, or to inherit their priority, and is that the reason?

Exactly, allow contexts to be created and used per user so that individual
priorities and rescheduling are possible.

> So I guess overall this is an alternative to fixed vma windows but can 
> be pipelined.
> 
> I did not get the foreach part. Why do you have to iterate existing 
> entries to add entries? Can't you just populate the reserved range from 
> the stashed bo dma addresses?

This is identical to the current window scheme, except that instead of
having to lock the migration vm for the duration of the entire execution,
as the ppGTT is managed synchronously on the CPU (and so only one thread
can use the migration vm and must wait for the execution to finish
before the next can adjust the window), we push the ppGTT management into
the GPU command packet. Each client/context can then build their copy
request in parallel and leave it to the scheduler. And then with the
fence returned, subsequent use of the pages can also be scheduled
asynchronously.
-Chris

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 13:39     ` Chris Wilson
@ 2020-11-30 14:11       ` Chris Wilson
  2020-11-30 16:07       ` Tvrtko Ursulin
  1 sibling, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2020-11-30 14:11 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2020-11-30 13:39:54)
> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
> > 
> > On 28/11/2020 18:40, Chris Wilson wrote:
> > > +struct i915_request *
> > > +intel_context_migrate_pages(struct intel_context *ce,
> > > +                         struct scatterlist *src,
> > > +                         struct scatterlist *dst)
> > > +{
> > > +     struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
> > > +     u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
> > > +     struct i915_request *rq;
> > > +     int len;
> > > +     int err;
> > > +
> > > +     /* GEM_BUG_ON(ce->vm != migrate_vm); */
> > > +
> > > +     err = intel_context_pin(ce);
> > > +     if (err)
> > > +             return ERR_PTR(err);
> > > +
> > > +     GEM_BUG_ON(ce->ring->size < SZ_64K);
> > > +
> > > +     do {
> > > +             rq = i915_request_create(ce);
> > > +             if (IS_ERR(rq)) {
> > > +                     err = PTR_ERR(rq);
> > > +                     goto out_ce;
> > > +             }
> > > +
> > > +             len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
> > > +             if (len <= 0) {
> > > +                     err = len;
> > > +                     goto out_rq;
> > > +             }
> > > +
> > > +             if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
> > > +                     err = -EINVAL;
> > > +                     goto out_rq;
> > > +             }
> > 
> > Source and destination PTEs into the reserved [0, sz * 2) area?
> 
> Yes.
> 
> > 
> > > +
> > > +             err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
> > > +             if (err)
> > > +                     goto out_rq;
> > > +
> > > +             err = emit_copy(rq, len);
> > 
> > Right so copy can use fixed offsets.
> > 
> > > +             if (err)
> > > +                     goto out_rq;
> > > +
> > > +             if (!it_s.sg)
> > > +                     i915_request_get(rq);
> > > +out_rq:
> > > +             i915_request_add(rq);
> > > +             if (it_s.sg)
> > > +                     cond_resched();
> > 
> > From what context does this run? Not preemptible?
> 
> Has to be process context; numerous allocations, implicit waits (that we
> want to avoid in practice), and the timeline (per-context) mutex to
> guard access to the ringbuffer.

Another thought was to allocate new contexts on the fly, as we can only
copy about 500MiB before stalling when using GPU PTE updates.

However, I thought reallocations would be worse for the code flow.
-Chris

* [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt
  2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
                   ` (2 preceding siblings ...)
  2020-11-28 18:40 ` [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration Chris Wilson
@ 2020-11-30 15:19 ` Patchwork
  2020-11-30 15:51 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
  4 siblings, 0 replies; 16+ messages in thread
From: Patchwork @ 2020-11-30 15:19 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx

== Series Details ==

Series: series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt
URL   : https://patchwork.freedesktop.org/series/84376/
State : warning

== Summary ==

$ dim checkpatch origin/drm-tip
cfaf43ff6b76 drm/i915/gt: Add an insert_entry for gen8_ppgtt
bca7cfcb320e drm/i915/gt: Add a routine to iterate over the pagetables of a GTT
88fd7da3f27b drm/i915/gt: Export the pinned context constructor
dbfef9ac08ef drm/i915/gt: Pipelined page migration
-:38: WARNING:FILE_PATH_CHANGES: added, moved or deleted file(s), does MAINTAINERS need updating?
#38: 
new file mode 100644

total: 0 errors, 1 warnings, 0 checks, 529 lines checked



* [Intel-gfx] ✗ Fi.CI.BAT: failure for series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt
  2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
                   ` (3 preceding siblings ...)
  2020-11-30 15:19 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Patchwork
@ 2020-11-30 15:51 ` Patchwork
  4 siblings, 0 replies; 16+ messages in thread
From: Patchwork @ 2020-11-30 15:51 UTC (permalink / raw)
  To: Chris Wilson; +Cc: intel-gfx


== Series Details ==

Series: series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt
URL   : https://patchwork.freedesktop.org/series/84376/
State : failure

== Summary ==

CI Bug Log - changes from CI_DRM_9405 -> Patchwork_19009
====================================================

Summary
-------

  **FAILURE**

  Serious unknown changes coming with Patchwork_19009 absolutely need to be
  verified manually.
  
  If you think the reported changes have nothing to do with the changes
  introduced in Patchwork_19009, please notify your bug team to allow them
  to document this new failure mode, which will reduce false positives in CI.

  External URL: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/index.html

Possible new issues
-------------------

  Here are the unknown changes that may have been introduced in Patchwork_19009:

### IGT changes ###

#### Possible regressions ####

  * {igt@i915_selftest@live@migrate} (NEW):
    - fi-snb-2600:        NOTRUN -> [INCOMPLETE][1]
   [1]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-snb-2600/igt@i915_selftest@live@migrate.html
    - fi-ivb-3770:        NOTRUN -> [INCOMPLETE][2]
   [2]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-ivb-3770/igt@i915_selftest@live@migrate.html
    - fi-hsw-4770:        NOTRUN -> [INCOMPLETE][3]
   [3]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-hsw-4770/igt@i915_selftest@live@migrate.html
    - fi-byt-j1900:       NOTRUN -> [INCOMPLETE][4]
   [4]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-byt-j1900/igt@i915_selftest@live@migrate.html
    - fi-snb-2520m:       NOTRUN -> [INCOMPLETE][5]
   [5]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-snb-2520m/igt@i915_selftest@live@migrate.html

  * igt@runner@aborted:
    - fi-snb-2520m:       NOTRUN -> [FAIL][6]
   [6]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-snb-2520m/igt@runner@aborted.html
    - fi-snb-2600:        NOTRUN -> [FAIL][7]
   [7]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-snb-2600/igt@runner@aborted.html
    - fi-ivb-3770:        NOTRUN -> [FAIL][8]
   [8]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-ivb-3770/igt@runner@aborted.html

  
New tests
---------

  New tests have been introduced between CI_DRM_9405 and Patchwork_19009:

### New CI tests (1) ###

  * boot:
    - Statuses : 1 fail(s) 40 pass(s)
    - Exec time: [0.0] s

  


### New IGT tests (1) ###

  * igt@i915_selftest@live@migrate:
    - Statuses : 5 incomplete(s) 31 pass(s)
    - Exec time: [0.0, 17.86] s

  

Known issues
------------

  Here are the changes found in Patchwork_19009 that come from known issues:

### IGT changes ###

#### Issues hit ####

  * igt@gem_exec_suspend@basic-s0:
    - fi-apl-guc:         [PASS][9] -> [DMESG-WARN][10] ([i915#62]) +1 similar issue
   [9]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-apl-guc/igt@gem_exec_suspend@basic-s0.html
   [10]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-apl-guc/igt@gem_exec_suspend@basic-s0.html

  * igt@gem_mmap_gtt@basic:
    - fi-tgl-y:           [PASS][11] -> [DMESG-WARN][12] ([i915#402]) +1 similar issue
   [11]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-tgl-y/igt@gem_mmap_gtt@basic.html
   [12]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-tgl-y/igt@gem_mmap_gtt@basic.html

  * igt@i915_module_load@reload:
    - fi-tgl-u2:          [PASS][13] -> [DMESG-WARN][14] ([i915#1982] / [k.org#205379])
   [13]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-tgl-u2/igt@i915_module_load@reload.html
   [14]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-tgl-u2/igt@i915_module_load@reload.html

  * igt@i915_pm_rpm@module-reload:
    - fi-byt-j1900:       [PASS][15] -> [DMESG-WARN][16] ([i915#1982])
   [15]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html
   [16]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-byt-j1900/igt@i915_pm_rpm@module-reload.html

  * igt@i915_selftest@live@coherency:
    - fi-gdg-551:         [PASS][17] -> [DMESG-FAIL][18] ([i915#1748])
   [17]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-gdg-551/igt@i915_selftest@live@coherency.html
   [18]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-gdg-551/igt@i915_selftest@live@coherency.html

  * igt@kms_chamelium@dp-crc-fast:
    - fi-cml-u2:          [PASS][19] -> [DMESG-WARN][20] ([i915#1982])
   [19]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-cml-u2/igt@kms_chamelium@dp-crc-fast.html
   [20]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-cml-u2/igt@kms_chamelium@dp-crc-fast.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic:
    - fi-bsw-kefka:       [PASS][21] -> [DMESG-WARN][22] ([i915#1982])
   [21]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html
   [22]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-bsw-kefka/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-atomic.html

  * igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy:
    - fi-icl-u2:          [PASS][23] -> [DMESG-WARN][24] ([i915#1982])
   [23]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-icl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html
   [24]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-icl-u2/igt@kms_cursor_legacy@basic-busy-flip-before-cursor-legacy.html

  
#### Possible fixes ####

  * igt@core_hotunplug@unbind-rebind:
    - fi-icl-u2:          [DMESG-WARN][25] ([i915#1982]) -> [PASS][26] +1 similar issue
   [25]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-icl-u2/igt@core_hotunplug@unbind-rebind.html
   [26]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-icl-u2/igt@core_hotunplug@unbind-rebind.html

  * igt@i915_module_load@reload:
    - fi-tgl-y:           [DMESG-WARN][27] ([i915#1982] / [k.org#205379]) -> [PASS][28]
   [27]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-tgl-y/igt@i915_module_load@reload.html
   [28]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-tgl-y/igt@i915_module_load@reload.html

  * igt@i915_pm_rpm@basic-pci-d3-state:
    - {fi-ehl-1}:         [DMESG-WARN][29] ([i915#1982]) -> [PASS][30]
   [29]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-ehl-1/igt@i915_pm_rpm@basic-pci-d3-state.html
   [30]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-ehl-1/igt@i915_pm_rpm@basic-pci-d3-state.html
    - fi-bsw-kefka:       [DMESG-WARN][31] ([i915#1982]) -> [PASS][32]
   [31]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-bsw-kefka/igt@i915_pm_rpm@basic-pci-d3-state.html
   [32]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-bsw-kefka/igt@i915_pm_rpm@basic-pci-d3-state.html

  * igt@i915_selftest@live@execlists:
    - fi-icl-y:           [INCOMPLETE][33] ([i915#1037] / [i915#2276]) -> [PASS][34]
   [33]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-icl-y/igt@i915_selftest@live@execlists.html
   [34]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-icl-y/igt@i915_selftest@live@execlists.html

  * igt@kms_busy@basic@flip:
    - fi-kbl-soraka:      [DMESG-WARN][35] ([i915#1982]) -> [PASS][36]
   [35]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-kbl-soraka/igt@kms_busy@basic@flip.html
   [36]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-kbl-soraka/igt@kms_busy@basic@flip.html
    - fi-tgl-y:           [DMESG-WARN][37] ([i915#1982]) -> [PASS][38]
   [37]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-tgl-y/igt@kms_busy@basic@flip.html
   [38]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-tgl-y/igt@kms_busy@basic@flip.html

  * {igt@prime_vgem@basic-userptr}:
    - fi-tgl-y:           [DMESG-WARN][39] ([i915#402]) -> [PASS][40] +2 similar issues
   [39]: https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_9405/fi-tgl-y/igt@prime_vgem@basic-userptr.html
   [40]: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/fi-tgl-y/igt@prime_vgem@basic-userptr.html

  
  {name}: This element is suppressed. This means it is ignored when computing
          the status of the difference (SUCCESS, WARNING, or FAILURE).

  [i915#1037]: https://gitlab.freedesktop.org/drm/intel/issues/1037
  [i915#1748]: https://gitlab.freedesktop.org/drm/intel/issues/1748
  [i915#1982]: https://gitlab.freedesktop.org/drm/intel/issues/1982
  [i915#2276]: https://gitlab.freedesktop.org/drm/intel/issues/2276
  [i915#402]: https://gitlab.freedesktop.org/drm/intel/issues/402
  [i915#62]: https://gitlab.freedesktop.org/drm/intel/issues/62
  [k.org#205379]: https://bugzilla.kernel.org/show_bug.cgi?id=205379


Participating hosts (45 -> 41)
------------------------------

  Missing    (4): fi-ilk-m540 fi-bsw-cyan fi-bdw-samus fi-hsw-4200u 


Build changes
-------------

  * Linux: CI_DRM_9405 -> Patchwork_19009

  CI-20190529: 20190529
  CI_DRM_9405: 9b8ec3277e6ac37b18a215370c353f0ae1a187b7 @ git://anongit.freedesktop.org/gfx-ci/linux
  IGT_5876: cf2f41b3d3dfabaf3a4837062f996f3491a350b1 @ git://anongit.freedesktop.org/xorg/app/intel-gpu-tools
  Patchwork_19009: dbfef9ac08efbab53b45e9cf288b6bff8a28a964 @ git://anongit.freedesktop.org/gfx-ci/linux


== Linux commits ==

dbfef9ac08ef drm/i915/gt: Pipelined page migration
88fd7da3f27b drm/i915/gt: Export the pinned context constructor
bca7cfcb320e drm/i915/gt: Add a routine to iterate over the pagetables of a GTT
cfaf43ff6b76 drm/i915/gt: Add an insert_entry for gen8_ppgtt

== Logs ==

For more details see: https://intel-gfx-ci.01.org/tree/drm-tip/Patchwork_19009/index.html

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 13:39     ` Chris Wilson
  2020-11-30 14:11       ` Chris Wilson
@ 2020-11-30 16:07       ` Tvrtko Ursulin
  2020-11-30 16:21         ` Chris Wilson
  1 sibling, 1 reply; 16+ messages in thread
From: Tvrtko Ursulin @ 2020-11-30 16:07 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/11/2020 13:39, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
>> On 28/11/2020 18:40, Chris Wilson wrote:
>>> If we pipeline the PTE updates and then do the copy of those pages
>>> within a single unpreemptible command packet, we can submit the copies
>>> and leave them to be scheduled without having to synchronously wait
>>> under a global lock.
>>>
>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>> ---
>>>    drivers/gpu/drm/i915/Makefile                 |   1 +
>>>    drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
>>>    drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
>>>    drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
>>>    drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
>>>    .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>>>    6 files changed, 511 insertions(+)
>>>    create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
>>>    create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
>>>    create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
>>>
>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
>>> index e5574e506a5c..0b2e12c87f9d 100644
>>> --- a/drivers/gpu/drm/i915/Makefile
>>> +++ b/drivers/gpu/drm/i915/Makefile
>>> @@ -103,6 +103,7 @@ gt-y += \
>>>        gt/intel_gtt.o \
>>>        gt/intel_llc.o \
>>>        gt/intel_lrc.o \
>>> +     gt/intel_migrate.o \
>>>        gt/intel_mocs.o \
>>>        gt/intel_ppgtt.o \
>>>        gt/intel_rc6.o \
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
>>> index ac58fcda4927..079d26b47a97 100644
>>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
>>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>>>    #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
>>>    #define I915_GEM_HWS_SEQNO          0x40
>>>    #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
>>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
>>>    #define I915_GEM_HWS_SCRATCH                0x80
>>>    
>>>    #define I915_HWS_CSB_BUF0_INDEX             0x10
>>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>> new file mode 100644
>>> index 000000000000..4d7bd32eb8d4
>>> --- /dev/null
>>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>> @@ -0,0 +1,370 @@
>>> +// SPDX-License-Identifier: MIT
>>> +/*
>>> + * Copyright © 2020 Intel Corporation
>>> + */
>>> +
>>> +#include "i915_drv.h"
>>> +#include "intel_context.h"
>>> +#include "intel_gt.h"
>>> +#include "intel_gtt.h"
>>> +#include "intel_lrc.h" /* virtual engine */
>>> +#include "intel_migrate.h"
>>> +#include "intel_ring.h"
>>> +
>>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
>>> +
>>> +static void insert_pte(struct i915_address_space *vm,
>>> +                    struct i915_page_table *pt,
>>> +                    void *data)
>>> +{
>>> +     u64 *offset = data;
>>> +
>>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
>>> +     *offset += PAGE_SIZE;
>>> +}
>>> +
>>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
>>> +{
>>> +     struct i915_vm_pt_stash stash = {};
>>> +     struct i915_ppgtt *vm;
>>> +     u64 offset, sz;
>>> +     int err;
>>> +
>>> +     vm = i915_ppgtt_create(gt);
>>> +     if (IS_ERR(vm))
>>> +             return ERR_CAST(vm);
>>> +
>>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
>>> +             err = -ENODEV;
>>> +             goto err_vm;
>>> +     }
>>> +
>>> +     /*
>>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
>>> +      * 4x2 page directories for source/destination.
>>> +      */
>>> +     sz = 2 * CHUNK_SZ;
>>> +     offset = sz;
>>> +
>>> +     /*
>>> +      * We need another page directory setup so that we can write
>>> +      * the 8x512 PTE in each chunk.
>>> +      */
>>> +     sz += (sz >> 12) * sizeof(u64);
>>> +
>>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
>>> +     if (err)
>>> +             goto err_vm;
>>> +
>>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
>>> +     if (err) {
>>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
>>> +             goto err_vm;
>>> +     }
>>> +
>>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
>>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
>>> +
>>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
>>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
>>
>> This is just making the [0 - sz) gva point to the allocated sz bytes of
>> backing store?
> 
> Not quite, we are pointing [offset, sz) to the page directories
> themselves. When we write into that range, we are modifying the PTE of
> this ppGTT. (Breaking the fourth wall, allowing the non-privileged
> context to update its own page tables.)

Confusing, so the first step here is making the [0, sz) gva PTEs point
to the PTE backing store.

Which means emit_pte, when called from intel_context_migrate_pages, is
emitting sdw which will write into [0, sz) gva, effectively placing the
src and dest object content at those locations.

Then the blit copies the data.

What happens if the context is re-used? The gvas no longer point to the
PTEs, so how does that work? Is a step to re-initialize after use
missing? More sdw after the blit copy to make them point back to the
PTEs?

> 
> This vm could be engine->vm, but I don't think we actually save anything
> by keeping them independent, and we gain the safety of not futzing with
> the kernel_context.
> 
> Remind me that kernel_vm can share their scratches, even when read-only
> is not supported.
> 
>>
>>> +
>>> +     return &vm->vm;
>>> +
>>> +err_vm:
>>> +     i915_vm_put(&vm->vm);
>>> +     return ERR_PTR(err);
>>> +}
>>> +
>>> +struct i915_request *
>>> +intel_context_migrate_pages(struct intel_context *ce,
>>> +                         struct scatterlist *src,
>>> +                         struct scatterlist *dst)
>>> +{
>>> +     struct sgt_dma it_s = sg_sgt(src), it_d = sg_sgt(dst);
>>> +     u64 encode = ce->vm->pte_encode(0, I915_CACHE_LLC, 0); /* flags */
>>> +     struct i915_request *rq;
>>> +     int len;
>>> +     int err;
>>> +
>>> +     /* GEM_BUG_ON(ce->vm != migrate_vm); */
>>> +
>>> +     err = intel_context_pin(ce);
>>> +     if (err)
>>> +             return ERR_PTR(err);
>>> +
>>> +     GEM_BUG_ON(ce->ring->size < SZ_64K);
>>> +
>>> +     do {
>>> +             rq = i915_request_create(ce);
>>> +             if (IS_ERR(rq)) {
>>> +                     err = PTR_ERR(rq);
>>> +                     goto out_ce;
>>> +             }
>>> +
>>> +             len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);
>>> +             if (len <= 0) {
>>> +                     err = len;
>>> +                     goto out_rq;
>>> +             }
>>> +
>>> +             if (emit_pte(rq, &it_d, encode, CHUNK_SZ, len) < len) {
>>> +                     err = -EINVAL;
>>> +                     goto out_rq;
>>> +             }
>>
>> Source and destination PTEs into the reserved [0, sz * 2) area?
> 
> Yes.
> 
>>
>>> +
>>> +             err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
>>> +             if (err)
>>> +                     goto out_rq;
>>> +
>>> +             err = emit_copy(rq, len);
>>
>> Right so copy can use fixed offsets.
>>
>>> +             if (err)
>>> +                     goto out_rq;
>>> +
>>> +             if (!it_s.sg)
>>> +                     i915_request_get(rq);
>>> +out_rq:
>>> +             i915_request_add(rq);
>>> +             if (it_s.sg)
>>> +                     cond_resched();
>>
>>   From what context does this run? No preemptible?
> 
> Has to be process context; numerous allocations, implicit waits (that we
> want to avoid in practice), and the timeline (per-context) mutex to
> guard access to the ringbuffer.

I guess on a non-preemptible, or not fully preemptible, kernel it is useful.

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 16:07       ` Tvrtko Ursulin
@ 2020-11-30 16:21         ` Chris Wilson
  2020-11-30 16:26           ` Tvrtko Ursulin
  0 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2020-11-30 16:21 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2020-11-30 16:07:44)
> 
> On 30/11/2020 13:39, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
> >> On 28/11/2020 18:40, Chris Wilson wrote:
> >>> If we pipeline the PTE updates and then do the copy of those pages
> >>> within a single unpreemptible command packet, we can submit the copies
> >>> and leave them to be scheduled without having to synchronously wait
> >>> under a global lock.
> >>>
> >>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>> ---
> >>>    drivers/gpu/drm/i915/Makefile                 |   1 +
> >>>    drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
> >>>    drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
> >>>    drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
> >>>    drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
> >>>    .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >>>    6 files changed, 511 insertions(+)
> >>>    create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>    create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
> >>>    create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> >>>
> >>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> >>> index e5574e506a5c..0b2e12c87f9d 100644
> >>> --- a/drivers/gpu/drm/i915/Makefile
> >>> +++ b/drivers/gpu/drm/i915/Makefile
> >>> @@ -103,6 +103,7 @@ gt-y += \
> >>>        gt/intel_gtt.o \
> >>>        gt/intel_llc.o \
> >>>        gt/intel_lrc.o \
> >>> +     gt/intel_migrate.o \
> >>>        gt/intel_mocs.o \
> >>>        gt/intel_ppgtt.o \
> >>>        gt/intel_rc6.o \
> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>> index ac58fcda4927..079d26b47a97 100644
> >>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> >>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
> >>>    #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
> >>>    #define I915_GEM_HWS_SEQNO          0x40
> >>>    #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
> >>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
> >>>    #define I915_GEM_HWS_SCRATCH                0x80
> >>>    
> >>>    #define I915_HWS_CSB_BUF0_INDEX             0x10
> >>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>> new file mode 100644
> >>> index 000000000000..4d7bd32eb8d4
> >>> --- /dev/null
> >>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>> @@ -0,0 +1,370 @@
> >>> +// SPDX-License-Identifier: MIT
> >>> +/*
> >>> + * Copyright © 2020 Intel Corporation
> >>> + */
> >>> +
> >>> +#include "i915_drv.h"
> >>> +#include "intel_context.h"
> >>> +#include "intel_gt.h"
> >>> +#include "intel_gtt.h"
> >>> +#include "intel_lrc.h" /* virtual engine */
> >>> +#include "intel_migrate.h"
> >>> +#include "intel_ring.h"
> >>> +
> >>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> >>> +
> >>> +static void insert_pte(struct i915_address_space *vm,
> >>> +                    struct i915_page_table *pt,
> >>> +                    void *data)
> >>> +{
> >>> +     u64 *offset = data;
> >>> +
> >>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
> >>> +     *offset += PAGE_SIZE;
> >>> +}
> >>> +
> >>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> >>> +{
> >>> +     struct i915_vm_pt_stash stash = {};
> >>> +     struct i915_ppgtt *vm;
> >>> +     u64 offset, sz;
> >>> +     int err;
> >>> +
> >>> +     vm = i915_ppgtt_create(gt);
> >>> +     if (IS_ERR(vm))
> >>> +             return ERR_CAST(vm);
> >>> +
> >>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> >>> +             err = -ENODEV;
> >>> +             goto err_vm;
> >>> +     }
> >>> +
> >>> +     /*
> >>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> >>> +      * 4x2 page directories for source/destination.
> >>> +      */
> >>> +     sz = 2 * CHUNK_SZ;
> >>> +     offset = sz;
> >>> +
> >>> +     /*
> >>> +      * We need another page directory setup so that we can write
> >>> +      * the 8x512 PTE in each chunk.
> >>> +      */
> >>> +     sz += (sz >> 12) * sizeof(u64);
> >>> +
> >>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> >>> +     if (err)
> >>> +             goto err_vm;
> >>> +
> >>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
> >>> +     if (err) {
> >>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
> >>> +             goto err_vm;
> >>> +     }
> >>> +
> >>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
> >>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
> >>> +
> >>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
> >>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
> >>
> >> This is just making the [0 - sz) gva point to the allocated sz bytes of
> >> backing store?
> > 
> > Not quite, we are pointing [offset, sz) to the page directories
> > themselves. When we write into that range, we are modifying the PTE of
> > this ppGTT. (Breaking the fourth wall, allowing the non-privileged
> > context to update its own page tables.)
> 
> Confusing, so first step is here making [0, sz) gva ptes point to pte 
> backing store.
> 
> Which means emit_pte when called from intel_context_migrate_pages is 
> emitting sdw which will write into [0, sz] gva, effectively placing the 
> src and dest object content at those location.
> 
> Then blit copies the data.
> 
> What happens if context is re-used? Gva's no longer point to PTEs so 
> how? Is a step to re-initialize after use missing? More sdw after the 
> blit copy to make it point back to ptes?

Every chunk starts with writing the PTEs used by the chunk. Within the
chunk, the GPU commands are not preemptible, so the PTEs cannot be
replaced before they are consumed.

We do leave the PTEs pointing to stale pages after the blit, but since
the vm is only ever used by first rewriting the PTEs, those dangling
PTEs should never be chased.

So each chunk does a fixed copy between the same pair of addresses
within the vm; the magic is in rewriting the PTE to point at the pages
of interest for the chunk.
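
A rough sketch of the vm layout being described, assuming CHUNK_SZ = SZ_8M
and 4KiB GTT pages (so the two copy windows need 4096 PTEs, i.e. 32KiB of
page-table storage); this is a reading of the patch, not part of it:

	/*
	 * [0, CHUNK_SZ)             fixed window, rewritten every chunk to
	 *                           point at the current source pages
	 * [CHUNK_SZ, 2 * CHUNK_SZ)  fixed window, rewritten every chunk to
	 *                           point at the current destination pages
	 * [2 * CHUNK_SZ,
	 *  2 * CHUNK_SZ + 32K)      the page tables backing the two windows,
	 *                           mapped into the vm itself so the context
	 *                           can rewrite its own PTEs from the ring
	 */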

> >>> +             i915_request_add(rq);
> >>> +             if (it_s.sg)
> >>> +                     cond_resched();
> >>
> >>   From what context does this run? No preemptible?
> > 
> > Has to be process context; numerous allocations, implicit waits (that we
> > want to avoid in practice), and the timeline (per-context) mutex to
> > guard access to the ringbuffer.
> 
> I guess on non-preemptible, or not fully preemptible kernel it is useful.

Oh, the cond_resched() is almost certainly overkill. But it is
potentially a very large loop, so it is worth being careful.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 16:21         ` Chris Wilson
@ 2020-11-30 16:26           ` Tvrtko Ursulin
  2020-11-30 16:44             ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Tvrtko Ursulin @ 2020-11-30 16:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/11/2020 16:21, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-11-30 16:07:44)
>>
>> On 30/11/2020 13:39, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
>>>> On 28/11/2020 18:40, Chris Wilson wrote:
>>>>> If we pipeline the PTE updates and then do the copy of those pages
>>>>> within a single unpreemptible command packet, we can submit the copies
>>>>> and leave them to be scheduled without having to synchronously wait
>>>>> under a global lock.
>>>>>
>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>> ---
>>>>>     drivers/gpu/drm/i915/Makefile                 |   1 +
>>>>>     drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
>>>>>     drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
>>>>>     drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
>>>>>     drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
>>>>>     .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>>>>>     6 files changed, 511 insertions(+)
>>>>>     create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>>     create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
>>>>>     create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
>>>>>
>>>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
>>>>> index e5574e506a5c..0b2e12c87f9d 100644
>>>>> --- a/drivers/gpu/drm/i915/Makefile
>>>>> +++ b/drivers/gpu/drm/i915/Makefile
>>>>> @@ -103,6 +103,7 @@ gt-y += \
>>>>>         gt/intel_gtt.o \
>>>>>         gt/intel_llc.o \
>>>>>         gt/intel_lrc.o \
>>>>> +     gt/intel_migrate.o \
>>>>>         gt/intel_mocs.o \
>>>>>         gt/intel_ppgtt.o \
>>>>>         gt/intel_rc6.o \
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>> index ac58fcda4927..079d26b47a97 100644
>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>>>>>     #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
>>>>>     #define I915_GEM_HWS_SEQNO          0x40
>>>>>     #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
>>>>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
>>>>>     #define I915_GEM_HWS_SCRATCH                0x80
>>>>>     
>>>>>     #define I915_HWS_CSB_BUF0_INDEX             0x10
>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>> new file mode 100644
>>>>> index 000000000000..4d7bd32eb8d4
>>>>> --- /dev/null
>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>> @@ -0,0 +1,370 @@
>>>>> +// SPDX-License-Identifier: MIT
>>>>> +/*
>>>>> + * Copyright © 2020 Intel Corporation
>>>>> + */
>>>>> +
>>>>> +#include "i915_drv.h"
>>>>> +#include "intel_context.h"
>>>>> +#include "intel_gt.h"
>>>>> +#include "intel_gtt.h"
>>>>> +#include "intel_lrc.h" /* virtual engine */
>>>>> +#include "intel_migrate.h"
>>>>> +#include "intel_ring.h"
>>>>> +
>>>>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
>>>>> +
>>>>> +static void insert_pte(struct i915_address_space *vm,
>>>>> +                    struct i915_page_table *pt,
>>>>> +                    void *data)
>>>>> +{
>>>>> +     u64 *offset = data;
>>>>> +
>>>>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
>>>>> +     *offset += PAGE_SIZE;
>>>>> +}
>>>>> +
>>>>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
>>>>> +{
>>>>> +     struct i915_vm_pt_stash stash = {};
>>>>> +     struct i915_ppgtt *vm;
>>>>> +     u64 offset, sz;
>>>>> +     int err;
>>>>> +
>>>>> +     vm = i915_ppgtt_create(gt);
>>>>> +     if (IS_ERR(vm))
>>>>> +             return ERR_CAST(vm);
>>>>> +
>>>>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
>>>>> +             err = -ENODEV;
>>>>> +             goto err_vm;
>>>>> +     }
>>>>> +
>>>>> +     /*
>>>>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
>>>>> +      * 4x2 page directories for source/destination.
>>>>> +      */
>>>>> +     sz = 2 * CHUNK_SZ;
>>>>> +     offset = sz;
>>>>> +
>>>>> +     /*
>>>>> +      * We need another page directory setup so that we can write
>>>>> +      * the 8x512 PTE in each chunk.
>>>>> +      */
>>>>> +     sz += (sz >> 12) * sizeof(u64);
>>>>> +
>>>>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
>>>>> +     if (err)
>>>>> +             goto err_vm;
>>>>> +
>>>>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
>>>>> +     if (err) {
>>>>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
>>>>> +             goto err_vm;
>>>>> +     }
>>>>> +
>>>>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
>>>>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
>>>>> +
>>>>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
>>>>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
>>>>
>>>> This is just making the [0 - sz) gva point to the allocated sz bytes of
>>>> backing store?
>>>
>>> Not quite, we are pointing [offset, sz) to the page directories
>>> themselves. When we write into that range, we are modifying the PTE of
>>> this ppGTT. (Breaking the fourth wall, allowing the non-privileged
>>> context to update its own page tables.)
>>
>> Confusing, so first step is here making [0, sz) gva ptes point to pte
>> backing store.
>>
>> Which means emit_pte when called from intel_context_migrate_pages is
>> emitting sdw which will write into [0, sz] gva, effectively placing the
>> src and dest object content at those location.
>>
>> Then blit copies the data.
>>
>> What happens if context is re-used? Gva's no longer point to PTEs so
>> how? Is a step to re-initialize after use missing? More sdw after the
>> blit copy to make it point back to ptes?
> 
> Every chunk starts with writing the PTEs used by the chunk. Within the
> chunk, the GPU commands are not preemptible, so the PTEs cannot be
> replaced before they are consumed.
> 
> We do leave the PTE pointing to stale pages after the blit, but since
> the vm is only used by first rewriting the PTE, those dangling PTE
> should never be chased.
> 
> So each chunk does a fixed copy between the same pair of addresses
> within the vm; the magic is in rewriting the PTE to point at the pages
> of interest for the chunk.

Isn't the vm potentially getting re-used on the fall-back path, if
virtual engine creation fails, in which case dangling PTEs would be
written to? Or am I still missing something?

>>>>> +             i915_request_add(rq);
>>>>> +             if (it_s.sg)
>>>>> +                     cond_resched();
>>>>
>>>>    From what context does this run? No preemptible?
>>>
>>> Has to be process context; numerous allocations, implicit waits (that we
>>> want to avoid in practice), and the timeline (per-context) mutex to
>>> guard access to the ringbuffer.
>>
>> I guess on non-preemptible, or not fully preemptible kernel it is useful.
> 
> Oh, the cond_resched() is almost certainly overkill. But potentially a
> very large loop so worth being careful.

Yes, it could be a large loop, so it doesn't hurt.

Is the ringbuffer large enough to avoid stalls in that path?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 16:26           ` Tvrtko Ursulin
@ 2020-11-30 16:44             ` Chris Wilson
  2020-12-01  9:26               ` Tvrtko Ursulin
  0 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2020-11-30 16:44 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2020-11-30 16:26:44)
> 
> On 30/11/2020 16:21, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-11-30 16:07:44)
> >>
> >> On 30/11/2020 13:39, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
> >>>> On 28/11/2020 18:40, Chris Wilson wrote:
> >>>>> If we pipeline the PTE updates and then do the copy of those pages
> >>>>> within a single unpreemptible command packet, we can submit the copies
> >>>>> and leave them to be scheduled without having to synchronously wait
> >>>>> under a global lock.
> >>>>>
> >>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>> ---
> >>>>>     drivers/gpu/drm/i915/Makefile                 |   1 +
> >>>>>     drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
> >>>>>     drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
> >>>>>     drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
> >>>>>     drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
> >>>>>     .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >>>>>     6 files changed, 511 insertions(+)
> >>>>>     create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>>     create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
> >>>>>     create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> >>>>>
> >>>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> >>>>> index e5574e506a5c..0b2e12c87f9d 100644
> >>>>> --- a/drivers/gpu/drm/i915/Makefile
> >>>>> +++ b/drivers/gpu/drm/i915/Makefile
> >>>>> @@ -103,6 +103,7 @@ gt-y += \
> >>>>>         gt/intel_gtt.o \
> >>>>>         gt/intel_llc.o \
> >>>>>         gt/intel_lrc.o \
> >>>>> +     gt/intel_migrate.o \
> >>>>>         gt/intel_mocs.o \
> >>>>>         gt/intel_ppgtt.o \
> >>>>>         gt/intel_rc6.o \
> >>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>> index ac58fcda4927..079d26b47a97 100644
> >>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
> >>>>>     #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
> >>>>>     #define I915_GEM_HWS_SEQNO          0x40
> >>>>>     #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
> >>>>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
> >>>>>     #define I915_GEM_HWS_SCRATCH                0x80
> >>>>>     
> >>>>>     #define I915_HWS_CSB_BUF0_INDEX             0x10
> >>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>> new file mode 100644
> >>>>> index 000000000000..4d7bd32eb8d4
> >>>>> --- /dev/null
> >>>>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>> @@ -0,0 +1,370 @@
> >>>>> +// SPDX-License-Identifier: MIT
> >>>>> +/*
> >>>>> + * Copyright © 2020 Intel Corporation
> >>>>> + */
> >>>>> +
> >>>>> +#include "i915_drv.h"
> >>>>> +#include "intel_context.h"
> >>>>> +#include "intel_gt.h"
> >>>>> +#include "intel_gtt.h"
> >>>>> +#include "intel_lrc.h" /* virtual engine */
> >>>>> +#include "intel_migrate.h"
> >>>>> +#include "intel_ring.h"
> >>>>> +
> >>>>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> >>>>> +
> >>>>> +static void insert_pte(struct i915_address_space *vm,
> >>>>> +                    struct i915_page_table *pt,
> >>>>> +                    void *data)
> >>>>> +{
> >>>>> +     u64 *offset = data;
> >>>>> +
> >>>>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
> >>>>> +     *offset += PAGE_SIZE;
> >>>>> +}
> >>>>> +
> >>>>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> >>>>> +{
> >>>>> +     struct i915_vm_pt_stash stash = {};
> >>>>> +     struct i915_ppgtt *vm;
> >>>>> +     u64 offset, sz;
> >>>>> +     int err;
> >>>>> +
> >>>>> +     vm = i915_ppgtt_create(gt);
> >>>>> +     if (IS_ERR(vm))
> >>>>> +             return ERR_CAST(vm);
> >>>>> +
> >>>>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> >>>>> +             err = -ENODEV;
> >>>>> +             goto err_vm;
> >>>>> +     }
> >>>>> +
> >>>>> +     /*
> >>>>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> >>>>> +      * 4x2 page directories for source/destination.
> >>>>> +      */
> >>>>> +     sz = 2 * CHUNK_SZ;
> >>>>> +     offset = sz;
> >>>>> +
> >>>>> +     /*
> >>>>> +      * We need another page directory setup so that we can write
> >>>>> +      * the 8x512 PTE in each chunk.
> >>>>> +      */
> >>>>> +     sz += (sz >> 12) * sizeof(u64);
> >>>>> +
> >>>>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> >>>>> +     if (err)
> >>>>> +             goto err_vm;
> >>>>> +
> >>>>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
> >>>>> +     if (err) {
> >>>>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
> >>>>> +             goto err_vm;
> >>>>> +     }
> >>>>> +
> >>>>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
> >>>>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
> >>>>> +
> >>>>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
> >>>>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
> >>>>
> >>>> This is just making the [0 - sz) gva point to the allocated sz bytes of
> >>>> backing store?
> >>>
> >>> Not quite, we are pointing [offset, sz) to the page directories
> >>> themselves. When we write into that range, we are modifying the PTE of
> >>> this ppGTT. (Breaking the fourth wall, allowing the non-privileged
> >>> context to update its own page tables.)
> >>
> >> Confusing, so first step is here making [0, sz) gva ptes point to pte
> >> backing store.
> >>
> >> Which means emit_pte when called from intel_context_migrate_pages is
> >> emitting sdw which will write into [0, sz] gva, effectively placing the
> >> src and dest object content at those location.
> >>
> >> Then blit copies the data.
> >>
> >> What happens if context is re-used? Gva's no longer point to PTEs so
> >> how? Is a step to re-initialize after use missing? More sdw after the
> >> blit copy to make it point back to ptes?
> > 
> > Every chunk starts with writing the PTEs used by the chunk. Within the
> > chunk, the GPU commands are not preemptible, so the PTEs cannot be
> > replaced before they are consumed.
> > 
> > We do leave the PTE pointing to stale pages after the blit, but since
> > the vm is only used by first rewriting the PTE, those dangling PTE
> > should never be chased.
> > 
> > So each chunk does a fixed copy between the same pair of addresses
> > within the vm; the magic is in rewriting the PTE to point at the pages
> > of interest for the chunk.
> 
> Isn't the vm getting potentially re-used on the fall-back path, if 
> virtual engine creation fails? In which case dangling PTEs would be 
> written to. Or I am still missing something?

The same vm is used by all contexts (only one is created).

We require that the PTE write + copy occur uninterrupted, which is fine
as we control the arbitration points.
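
As a minimal sketch of what that means per chunk (using the helpers from
this patch, error handling omitted), each chunk's command stream amounts to:

	/* one CHUNK_SZ step of intel_context_migrate_pages(), schematically */
	len = emit_pte(rq, &it_s, encode, 0, CHUNK_SZ);  /* src window -> next src pages */
	emit_pte(rq, &it_d, encode, CHUNK_SZ, len);      /* dst window -> next dst pages */
	rq->engine->emit_flush(rq, EMIT_INVALIDATE);     /* invalidate TLBs for the new PTEs */
	emit_copy(rq, len);                              /* blit src window -> dst window */
	i915_request_add(rq);                            /* next chunk rewrites the PTEs again */

with no arbitration point emitted between the PTE writes and the copy, so
the chunk executes as one unpreemptible block.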

> >>>>> +             i915_request_add(rq);
> >>>>> +             if (it_s.sg)
> >>>>> +                     cond_resched();
> >>>>
> >>>>    From what context does this run? No preemptible?
> >>>
> >>> Has to be process context; numerous allocations, implicit waits (that we
> >>> want to avoid in practice), and the timeline (per-context) mutex to
> >>> guard access to the ringbuffer.
> >>
> >> I guess on non-preemptible, or not fully preemptible kernel it is useful.
> > 
> > Oh, the cond_resched() is almost certainly overkill. But potentially a
> > very large loop so worth being careful.
> 
> Yes, could be a large loop so doesn't harm.
> 
> Is the ringbuffer large enough to avoid stalls in that path?

Is any ringbuffer large enough to be able to copy between two 128PiB
objects?

It works out that we can only handle ~500MiB of copies before the buffer
is full and we must wait for a chunk to clear. Switching to a new context
at that point would keep on extending it until we run out of lmem and/or
smem, but the cost of fabricating a new context at that point makes it
dubious.

I think that the greater impact would be from different clients using the
global context; fighting over the ringbuffer is still an improvement over
everyone doing a synchronous copy and blocking all, but the single
timeline still acts as a global serialisation and priority barrier.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-11-30 16:44             ` Chris Wilson
@ 2020-12-01  9:26               ` Tvrtko Ursulin
  2020-12-01  9:33                 ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Tvrtko Ursulin @ 2020-12-01  9:26 UTC (permalink / raw)
  To: Chris Wilson, intel-gfx


On 30/11/2020 16:44, Chris Wilson wrote:
> Quoting Tvrtko Ursulin (2020-11-30 16:26:44)
>>
>> On 30/11/2020 16:21, Chris Wilson wrote:
>>> Quoting Tvrtko Ursulin (2020-11-30 16:07:44)
>>>>
>>>> On 30/11/2020 13:39, Chris Wilson wrote:
>>>>> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
>>>>>> On 28/11/2020 18:40, Chris Wilson wrote:
>>>>>>> If we pipeline the PTE updates and then do the copy of those pages
>>>>>>> within a single unpreemptible command packet, we can submit the copies
>>>>>>> and leave them to be scheduled without having to synchronously wait
>>>>>>> under a global lock.
>>>>>>>
>>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
>>>>>>> ---
>>>>>>>      drivers/gpu/drm/i915/Makefile                 |   1 +
>>>>>>>      drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
>>>>>>>      drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
>>>>>>>      drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
>>>>>>>      drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
>>>>>>>      .../drm/i915/selftests/i915_live_selftests.h  |   1 +
>>>>>>>      6 files changed, 511 insertions(+)
>>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
>>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
>>>>>>> index e5574e506a5c..0b2e12c87f9d 100644
>>>>>>> --- a/drivers/gpu/drm/i915/Makefile
>>>>>>> +++ b/drivers/gpu/drm/i915/Makefile
>>>>>>> @@ -103,6 +103,7 @@ gt-y += \
>>>>>>>          gt/intel_gtt.o \
>>>>>>>          gt/intel_llc.o \
>>>>>>>          gt/intel_lrc.o \
>>>>>>> +     gt/intel_migrate.o \
>>>>>>>          gt/intel_mocs.o \
>>>>>>>          gt/intel_ppgtt.o \
>>>>>>>          gt/intel_rc6.o \
>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>>>> index ac58fcda4927..079d26b47a97 100644
>>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
>>>>>>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
>>>>>>>      #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
>>>>>>>      #define I915_GEM_HWS_SEQNO          0x40
>>>>>>>      #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
>>>>>>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
>>>>>>>      #define I915_GEM_HWS_SCRATCH                0x80
>>>>>>>      
>>>>>>>      #define I915_HWS_CSB_BUF0_INDEX             0x10
>>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>>>> new file mode 100644
>>>>>>> index 000000000000..4d7bd32eb8d4
>>>>>>> --- /dev/null
>>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
>>>>>>> @@ -0,0 +1,370 @@
>>>>>>> +// SPDX-License-Identifier: MIT
>>>>>>> +/*
>>>>>>> + * Copyright © 2020 Intel Corporation
>>>>>>> + */
>>>>>>> +
>>>>>>> +#include "i915_drv.h"
>>>>>>> +#include "intel_context.h"
>>>>>>> +#include "intel_gt.h"
>>>>>>> +#include "intel_gtt.h"
>>>>>>> +#include "intel_lrc.h" /* virtual engine */
>>>>>>> +#include "intel_migrate.h"
>>>>>>> +#include "intel_ring.h"
>>>>>>> +
>>>>>>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
>>>>>>> +
>>>>>>> +static void insert_pte(struct i915_address_space *vm,
>>>>>>> +                    struct i915_page_table *pt,
>>>>>>> +                    void *data)
>>>>>>> +{
>>>>>>> +     u64 *offset = data;
>>>>>>> +
>>>>>>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
>>>>>>> +     *offset += PAGE_SIZE;
>>>>>>> +}
>>>>>>> +
>>>>>>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
>>>>>>> +{
>>>>>>> +     struct i915_vm_pt_stash stash = {};
>>>>>>> +     struct i915_ppgtt *vm;
>>>>>>> +     u64 offset, sz;
>>>>>>> +     int err;
>>>>>>> +
>>>>>>> +     vm = i915_ppgtt_create(gt);
>>>>>>> +     if (IS_ERR(vm))
>>>>>>> +             return ERR_CAST(vm);
>>>>>>> +
>>>>>>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
>>>>>>> +             err = -ENODEV;
>>>>>>> +             goto err_vm;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     /*
>>>>>>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
>>>>>>> +      * 4x2 page directories for source/destination.
>>>>>>> +      */
>>>>>>> +     sz = 2 * CHUNK_SZ;
>>>>>>> +     offset = sz;
>>>>>>> +
>>>>>>> +     /*
>>>>>>> +      * We need another page directory setup so that we can write
>>>>>>> +      * the 8x512 PTE in each chunk.
>>>>>>> +      */
>>>>>>> +     sz += (sz >> 12) * sizeof(u64);
>>>>>>> +
>>>>>>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
>>>>>>> +     if (err)
>>>>>>> +             goto err_vm;
>>>>>>> +
>>>>>>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
>>>>>>> +     if (err) {
>>>>>>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
>>>>>>> +             goto err_vm;
>>>>>>> +     }
>>>>>>> +
>>>>>>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
>>>>>>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
>>>>>>> +
>>>>>>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
>>>>>>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
>>>>>>
>>>>>> This is just making the [0 - sz) gva point to the allocated sz bytes of
>>>>>> backing store?
>>>>>
>>>>> Not quite, we are pointing [offset, sz) to the page directories
>>>>> themselves. When we write into that range, we are modifying the PTE of
>>>>> this ppGTT. (Breaking the fourth wall, allowing the non-privileged
>>>>> context to update its own page tables.)
>>>>
>>>> Confusing, so first step is here making [0, sz) gva ptes point to pte
>>>> backing store.
>>>>
>>>> Which means emit_pte when called from intel_context_migrate_pages is
>>>> emitting sdw which will write into [0, sz] gva, effectively placing the
>>>> src and dest object content at those location.
>>>>
>>>> Then blit copies the data.
>>>>
>>>> What happens if context is re-used? Gva's no longer point to PTEs so
>>>> how? Is a step to re-initialize after use missing? More sdw after the
>>>> blit copy to make it point back to ptes?
>>>
>>> Every chunk starts with writing the PTEs used by the chunk. Within the
>>> chunk, the GPU commands are not preemptible, so the PTEs cannot be
>>> replaced before they are consumed.
>>>
>>> We do leave the PTE pointing to stale pages after the blit, but since
>>> the vm is only used by first rewriting the PTE, those dangling PTE
>>> should never be chased.
>>>
>>> So each chunk does a fixed copy between the same pair of addresses
>>> within the vm; the magic is in rewriting the PTE to point at the pages
>>> of interest for the chunk.
>>
>> Isn't the vm getting potentially re-used on the fall-back path, if
>> virtual engine creation fails? In which case dangling PTEs would be
>> written to. Or I am still missing something?
> 
> The same vm is used by all contexts (only one is created).

Right, true. I'll try again, because the overall scheme is still
confusing me...

migrate_vm makes [0, 2 * CHUNK_SZ) point to the PTE backing store for
the same range.

The src emit_pte writes up to [0, CHUNK_SZ), effectively re-writing the
PTEs for that range.

The dest emit_pte does the same for the destination range.

The [0, 2 * CHUNK_SZ) virtual address PTEs now point to the src and dest
objects' backing store.

The blit copy emits blitter commands to copy between src and dest.

Now a second copy comes along and calls emit_pte, which again writes to
the [0, CHUNK_SZ) virtual range. How does that end up in the PTE backing
store and not the previous object's backing store?

Regards,

Tvrtko

_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-12-01  9:26               ` Tvrtko Ursulin
@ 2020-12-01  9:33                 ` Chris Wilson
  2020-12-01  9:49                   ` Chris Wilson
  0 siblings, 1 reply; 16+ messages in thread
From: Chris Wilson @ 2020-12-01  9:33 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Tvrtko Ursulin (2020-12-01 09:26:05)
> 
> On 30/11/2020 16:44, Chris Wilson wrote:
> > Quoting Tvrtko Ursulin (2020-11-30 16:26:44)
> >>
> >> On 30/11/2020 16:21, Chris Wilson wrote:
> >>> Quoting Tvrtko Ursulin (2020-11-30 16:07:44)
> >>>>
> >>>> On 30/11/2020 13:39, Chris Wilson wrote:
> >>>>> Quoting Tvrtko Ursulin (2020-11-30 13:12:55)
> >>>>>> On 28/11/2020 18:40, Chris Wilson wrote:
> >>>>>>> If we pipeline the PTE updates and then do the copy of those pages
> >>>>>>> within a single unpreemptible command packet, we can submit the copies
> >>>>>>> and leave them to be scheduled without having to synchronously wait
> >>>>>>> under a global lock.
> >>>>>>>
> >>>>>>> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
> >>>>>>> ---
> >>>>>>>      drivers/gpu/drm/i915/Makefile                 |   1 +
> >>>>>>>      drivers/gpu/drm/i915/gt/intel_engine.h        |   1 +
> >>>>>>>      drivers/gpu/drm/i915/gt/intel_migrate.c       | 370 ++++++++++++++++++
> >>>>>>>      drivers/gpu/drm/i915/gt/intel_migrate.h       |  33 ++
> >>>>>>>      drivers/gpu/drm/i915/gt/selftest_migrate.c    | 105 +++++
> >>>>>>>      .../drm/i915/selftests/i915_live_selftests.h  |   1 +
> >>>>>>>      6 files changed, 511 insertions(+)
> >>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/intel_migrate.h
> >>>>>>>      create mode 100644 drivers/gpu/drm/i915/gt/selftest_migrate.c
> >>>>>>>
> >>>>>>> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> >>>>>>> index e5574e506a5c..0b2e12c87f9d 100644
> >>>>>>> --- a/drivers/gpu/drm/i915/Makefile
> >>>>>>> +++ b/drivers/gpu/drm/i915/Makefile
> >>>>>>> @@ -103,6 +103,7 @@ gt-y += \
> >>>>>>>          gt/intel_gtt.o \
> >>>>>>>          gt/intel_llc.o \
> >>>>>>>          gt/intel_lrc.o \
> >>>>>>> +     gt/intel_migrate.o \
> >>>>>>>          gt/intel_mocs.o \
> >>>>>>>          gt/intel_ppgtt.o \
> >>>>>>>          gt/intel_rc6.o \
> >>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_engine.h b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>>>> index ac58fcda4927..079d26b47a97 100644
> >>>>>>> --- a/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_engine.h
> >>>>>>> @@ -188,6 +188,7 @@ intel_write_status_page(struct intel_engine_cs *engine, int reg, u32 value)
> >>>>>>>      #define I915_GEM_HWS_PREEMPT_ADDR   (I915_GEM_HWS_PREEMPT * sizeof(u32))
> >>>>>>>      #define I915_GEM_HWS_SEQNO          0x40
> >>>>>>>      #define I915_GEM_HWS_SEQNO_ADDR             (I915_GEM_HWS_SEQNO * sizeof(u32))
> >>>>>>> +#define I915_GEM_HWS_MIGRATE         (0x42 * sizeof(u32))
> >>>>>>>      #define I915_GEM_HWS_SCRATCH                0x80
> >>>>>>>      
> >>>>>>>      #define I915_HWS_CSB_BUF0_INDEX             0x10
> >>>>>>> diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>>>> new file mode 100644
> >>>>>>> index 000000000000..4d7bd32eb8d4
> >>>>>>> --- /dev/null
> >>>>>>> +++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
> >>>>>>> @@ -0,0 +1,370 @@
> >>>>>>> +// SPDX-License-Identifier: MIT
> >>>>>>> +/*
> >>>>>>> + * Copyright © 2020 Intel Corporation
> >>>>>>> + */
> >>>>>>> +
> >>>>>>> +#include "i915_drv.h"
> >>>>>>> +#include "intel_context.h"
> >>>>>>> +#include "intel_gt.h"
> >>>>>>> +#include "intel_gtt.h"
> >>>>>>> +#include "intel_lrc.h" /* virtual engine */
> >>>>>>> +#include "intel_migrate.h"
> >>>>>>> +#include "intel_ring.h"
> >>>>>>> +
> >>>>>>> +#define CHUNK_SZ SZ_8M /* ~1ms at 8GiB/s preemption delay */
> >>>>>>> +
> >>>>>>> +static void insert_pte(struct i915_address_space *vm,
> >>>>>>> +                    struct i915_page_table *pt,
> >>>>>>> +                    void *data)
> >>>>>>> +{
> >>>>>>> +     u64 *offset = data;
> >>>>>>> +
> >>>>>>> +     vm->insert_page(vm, px_dma(pt), *offset, I915_CACHE_NONE, 0);
> >>>>>>> +     *offset += PAGE_SIZE;
> >>>>>>> +}
> >>>>>>> +
> >>>>>>> +static struct i915_address_space *migrate_vm(struct intel_gt *gt)
> >>>>>>> +{
> >>>>>>> +     struct i915_vm_pt_stash stash = {};
> >>>>>>> +     struct i915_ppgtt *vm;
> >>>>>>> +     u64 offset, sz;
> >>>>>>> +     int err;
> >>>>>>> +
> >>>>>>> +     vm = i915_ppgtt_create(gt);
> >>>>>>> +     if (IS_ERR(vm))
> >>>>>>> +             return ERR_CAST(vm);
> >>>>>>> +
> >>>>>>> +     if (!vm->vm.allocate_va_range || !vm->vm.foreach) {
> >>>>>>> +             err = -ENODEV;
> >>>>>>> +             goto err_vm;
> >>>>>>> +     }
> >>>>>>> +
> >>>>>>> +     /*
> >>>>>>> +      * We copy in 8MiB chunks. Each PDE covers 2MiB, so we need
> >>>>>>> +      * 4x2 page directories for source/destination.
> >>>>>>> +      */
> >>>>>>> +     sz = 2 * CHUNK_SZ;
> >>>>>>> +     offset = sz;
> >>>>>>> +
> >>>>>>> +     /*
> >>>>>>> +      * We need another page directory setup so that we can write
> >>>>>>> +      * the 8x512 PTE in each chunk.
> >>>>>>> +      */
> >>>>>>> +     sz += (sz >> 12) * sizeof(u64);
> >>>>>>> +
> >>>>>>> +     err = i915_vm_alloc_pt_stash(&vm->vm, &stash, sz);
> >>>>>>> +     if (err)
> >>>>>>> +             goto err_vm;
> >>>>>>> +
> >>>>>>> +     err = i915_vm_pin_pt_stash(&vm->vm, &stash);
> >>>>>>> +     if (err) {
> >>>>>>> +             i915_vm_free_pt_stash(&vm->vm, &stash);
> >>>>>>> +             goto err_vm;
> >>>>>>> +     }
> >>>>>>> +
> >>>>>>> +     vm->vm.allocate_va_range(&vm->vm, &stash, 0, sz);
> >>>>>>> +     i915_vm_free_pt_stash(&vm->vm, &stash);
> >>>>>>> +
> >>>>>>> +     /* Now allow the GPU to rewrite the PTE via its own ppGTT */
> >>>>>>> +     vm->vm.foreach(&vm->vm, 0, sz, insert_pte, &offset);
> >>>>>>
> >>>>>> This is just making the [0, sz) gva point to the allocated sz bytes of
> >>>>>> backing store?
> >>>>>
> >>>>> Not quite, we are pointing [offset, sz) to the page directories
> >>>>> themselves. When we write into that range, we are modifying the PTE of
> >>>>> this ppGTT. (Breaking the fourth wall, allowing the non-privileged
> >>>>> context to update its own page tables.)
> >>>>
> >>>> Confusing, so the first step here is making the [0, sz) gva ptes point
> >>>> to the pte backing store.
> >>>>
> >>>> Which means emit_pte, when called from intel_context_migrate_pages, is
> >>>> emitting sdw which will write into [0, sz) gva, effectively placing the
> >>>> src and dest object content at those locations.
> >>>>
> >>>> Then blit copies the data.
> >>>>
> >>>> What happens if the context is re-used? The gvas no longer point to the
> >>>> PTEs, so how does that work? Is a re-initialization step after use
> >>>> missing? More sdw after the blit copy to make them point back to the
> >>>> ptes?
> >>>
> >>> Every chunk starts with writing the PTEs used by the chunk. Within the
> >>> chunk, the GPU commands are not preemptible, so the PTEs cannot be
> >>> replaced before they are consumed.
> >>>
> >>> We do leave the PTE pointing to stale pages after the blit, but since
> >>> the vm is only used by first rewriting the PTE, those dangling PTE
> >>> should never be chased.
> >>>
> >>> So each chunk does a fixed copy between the same pair of addresses
> >>> within the vm; the magic is in rewriting the PTE to point at the pages
> >>> of interest for the chunk.
> >>
> >> Isn't the vm getting potentially re-used on the fall-back path, if
> >> virtual engine creation fails? In which case dangling PTEs would be
> >> written to. Or am I still missing something?
> > 
> > The same vm is used by all contexts (only one is created).
> 
> Right, true. I'll try again because the overall scheme is still
> confusing me...
> 
> Migrate_vm makes [0, 2 * CHUNK_SZ) point to PTE backing store for the 
> same range.

[2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9]

> Src emit_pte writes up to [0, CHUNK_SZ) effectively re-writing PTEs for 
> the same range

Except that the PD backpointers start at 2 * CHUNK_SZ...

> Dest emit_pte same for the destination range

That starts at 2 * CHUNK_SZ + CHUNK_SZ >> 9

So I passed in the target address range to emit_pte

	len = emit_pte(rq, &it_src, encode, 0, CHUNK_SZ);
	err = emit_pte(rq, &it_dst, encode, CHUNK_SZ, len);

but the first thing emit_pte does is compute the PD offset for that
range:

        offset >>= 12;
        offset *= sizeof(u64);
        offset += 2 * CHUNK_SZ;
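
To spell out the arithmetic (my numbers, with CHUNK_SZ = 8MiB as defined
in the patch):

	src: offset = 0        -> PTE writes start at 2 * CHUNK_SZ (16MiB)
	dst: offset = CHUNK_SZ -> PTE writes start at 2 * CHUNK_SZ + 16KiB

(Each CHUNK_SZ covers 2048 pages, so 2048 * sizeof(u64) = 16KiB of PTE
per chunk, 32KiB in total.) The stores emitted by emit_pte never touch
[0, 2 * CHUNK_SZ) at all; they land in that 32KiB window above it, which
migrate_vm pointed at the chunk's own page tables.
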

> Virtual address [0, 2 * CHUNK_SZ) ptes now point to the src and dest
> objects' backing store.

Yes.

> Blit copy emits blitter commands to copy between src and dest.
> 
> Now the second copy comes and calls emit_pte, which again writes to the
> [0, CHUNK_SZ) virtual range. How does that end up in the PTE backing
> store and not the previous object's backing store?

It all boils down to the PD being offset.
-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration
  2020-12-01  9:33                 ` Chris Wilson
@ 2020-12-01  9:49                   ` Chris Wilson
  0 siblings, 0 replies; 16+ messages in thread
From: Chris Wilson @ 2020-12-01  9:49 UTC (permalink / raw)
  To: Tvrtko Ursulin, intel-gfx

Quoting Chris Wilson (2020-12-01 09:33:20)
> Quoting Tvrtko Ursulin (2020-12-01 09:26:05)
> > Now the second copy comes and calls emit_pte, which again writes to the
> > [0, CHUNK_SZ) virtual range. How does that end up in the PTE backing
> > store and not the previous object's backing store?
> 
> It all boils down to the PD being offset.

        /*
         * We construct a very special VM for use by all migration contexts,
         * it is kept pinned so that it can be used at any time. As we need
         * to pre-allocate the page directories for the migration VM, this
         * limits us to only using a small number of prepared vma.
         *
         * To be able to pipeline and reschedule migration operations while
         * avoiding unnecessary contention on the vm itself, the PTE updates
         * are inline with the blits. All the blits use the same fixed
         * addresses, with the backing store redirection being updated on the
         * fly. Only 2 implicit vma are used for all migration operations.
         *
         * We lay the ppGTT out as:
         *
         *      [0, CHUNK_SZ) -> first object
         *      [CHUNK_SZ, 2 * CHUNK_SZ) -> second object
         *      [2 * CHUNK_SZ, 2 * CHUNK_SZ + 2 * CHUNK_SZ >> 9] -> PTE
         *
         * By exposing the dma addresses of the page directories themselves
         * within the ppGTT, we are then able to rewrite the PTE prior to use.
         */
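
As a concrete (hypothetical, not in the patch) helper expressing the same
rule that emit_pte applies:

	/*
	 * Where inside the migrate ppGTT does the PTE for a chunk
	 * address live? One u64 per 4KiB page, placed after the two
	 * fixed CHUNK_SZ windows.
	 */
	static u64 pte_window_offset(u64 addr)
	{
		return 2 * CHUNK_SZ + (addr >> 12) * sizeof(u64);
	}

so pte_window_offset(0) == 2 * CHUNK_SZ for the source window, and
pte_window_offset(CHUNK_SZ) == 2 * CHUNK_SZ + CHUNK_SZ / 512 for the
destination, regardless of which pages the chunk is currently copying.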

-Chris
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread

Thread overview: 16+ messages
2020-11-28 18:40 [Intel-gfx] [RFC 1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Chris Wilson
2020-11-28 18:40 ` [Intel-gfx] [RFC 2/4] drm/i915/gt: Add a routine to iterate over the pagetables of a GTT Chris Wilson
2020-11-28 18:40 ` [Intel-gfx] [RFC 3/4] drm/i915/gt: Export the pinned context constructor Chris Wilson
2020-11-28 18:40 ` [Intel-gfx] [RFC 4/4] drm/i915/gt: Pipelined page migration Chris Wilson
2020-11-30 13:12   ` Tvrtko Ursulin
2020-11-30 13:39     ` Chris Wilson
2020-11-30 14:11       ` Chris Wilson
2020-11-30 16:07       ` Tvrtko Ursulin
2020-11-30 16:21         ` Chris Wilson
2020-11-30 16:26           ` Tvrtko Ursulin
2020-11-30 16:44             ` Chris Wilson
2020-12-01  9:26               ` Tvrtko Ursulin
2020-12-01  9:33                 ` Chris Wilson
2020-12-01  9:49                   ` Chris Wilson
2020-11-30 15:19 ` [Intel-gfx] ✗ Fi.CI.CHECKPATCH: warning for series starting with [RFC,1/4] drm/i915/gt: Add an insert_entry for gen8_ppgtt Patchwork
2020-11-30 15:51 ` [Intel-gfx] ✗ Fi.CI.BAT: failure " Patchwork
