* Support for amdgpu VM update via CPU on large-bar systems
@ 2017-05-09 21:47 Kasiviswanathan, Harish
       [not found] ` <DM3PR1201MB10382F282D646D630708246D8CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Kasiviswanathan, Harish @ 2017-05-09 21:47 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

[-- Attachment #1: Type: text/plain, Size: 364 bytes --]

Hi,

Please review the patch set that adds support for amdgpu VM updates via the CPU. This feature improves performance for compute (HSA), where mapping / unmapping is carried out by the kernel independently of command submissions (which are done directly by user space). This version doesn't support shadow copies of VM page tables for CPU-based updates.
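
In outline, the per-VM decision introduced by the patches below looks like this (a simplified sketch, not the exact patch code; the real logic lives in amdgpu_vm_init() in patch #1):

	/* Sketch: bit 0 of amdgpu.vm_update_context requests CPU updates for
	 * graphics VMs, bit 1 for compute VMs. CPU mode additionally requires
	 * a large BAR, i.e. all of VRAM being CPU-visible.
	 */
	bool use_cpu = (amdgpu_vm_update_context &
			AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK) &&
		       (adev->mc.real_vram_size == adev->mc.visible_vram_size);

For example, booting with amdgpu.vm_update_context=3 on the kernel command line would request CPU-based updates for both graphics and compute VMs, still subject to the large-BAR check.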

Best Regards,
Harish


[-- Attachment #2: 0001-drm-amdgpu-Add-vm-context-module-param.patch --]
[-- Type: application/octet-stream, Size: 6483 bytes --]

From 109b8cbf83c704e881ae92261fe556fb71a5bffd Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Tue, 9 May 2017 12:04:24 -0400
Subject: [PATCH 1/4] drm/amdgpu: Add vm context module param

Add VM context module param (amdgpu.vm_update_context) that can be used
to control how the VM pde/pte are updated for Graphics and Compute.

BIT0 controls Graphics and BIT1 Compute.
 BIT0 [= 0] Graphics updated by SDMA [= 1] by CPU
 BIT1 [= 0] Compute updated by SDMA [= 1] by CPU

However, CPU-based update is only supported on large BAR systems.

By default, vm_update_context = 2, indicating that Graphics VMs will be
updated via SDMA and Compute VMs will be updated via CPU (on large BAR
systems).

Change-Id: I95b2913b30d1027d1224a97e0ec92c08ae6494f2
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 17 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 17 ++++++++++++++++-
 5 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5310781..d420ae5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -93,6 +93,7 @@
 extern int amdgpu_vm_block_size;
 extern int amdgpu_vm_fault_stop;
 extern int amdgpu_vm_debug;
+extern int amdgpu_vm_update_context;
 extern int amdgpu_dc;
 extern int amdgpu_sched_jobs;
 extern int amdgpu_sched_hw_submission;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 416908a..5b8a6ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -93,6 +93,7 @@
 int amdgpu_vm_block_size = -1;
 int amdgpu_vm_fault_stop = 0;
 int amdgpu_vm_debug = 0;
+int amdgpu_vm_update_context = 2;
 int amdgpu_vram_page_split = 1024;
 int amdgpu_exp_hw_support = 0;
 int amdgpu_dc = -1;
@@ -179,6 +180,9 @@
 MODULE_PARM_DESC(vm_debug, "Debug VM handling (0 = disabled (default), 1 = enabled)");
 module_param_named(vm_debug, amdgpu_vm_debug, int, 0644);
 
+MODULE_PARM_DESC(vm_update_context, "VM update using CPU on large BAR. (0 = never, 1 = Graphics only, 2 = Compute only (default), 3 = Both)");
+module_param_named(vm_update_context, amdgpu_vm_update_context, int, 0444);
+
 MODULE_PARM_DESC(vram_page_split, "Number of pages after we split VRAM allocations (default 1024, -1 = disable)");
 module_param_named(vram_page_split, amdgpu_vram_page_split, int, 0444);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index c2d4465..e2b394e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -757,7 +757,9 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
 		goto out_suspend;
 	}
 
-	r = amdgpu_vm_init(adev, &fpriv->vm);
+	r = amdgpu_vm_init(adev, &fpriv->vm,
+			   !!(amdgpu_vm_update_context &
+			   AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK));
 	if (r) {
 		kfree(fpriv);
 		goto out_suspend;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ed97a2e..b5871f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -694,6 +694,13 @@ static u64 amdgpu_vm_adjust_mc_addr(struct amdgpu_device *adev, u64 mc_addr)
 	return addr;
 }
 
+static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
+{
+	if (adev->mc.visible_vram_size > 0 &&
+			adev->mc.real_vram_size == adev->mc.visible_vram_size)
+		return true;
+	return false;
+}
 /**
  * amdgpu_vm_flush - hardware flush the vm
  *
@@ -2234,10 +2241,12 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint64_t vm_size)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @vm_update_mode: false = use SDMA, true = use CPU to update page tables
  *
  * Init @vm fields.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+		   bool vm_update_mode)
 {
 	const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
 		AMDGPU_VM_PTE_COUNT(adev) * 8);
@@ -2265,7 +2274,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 				  rq, amdgpu_sched_jobs);
 	if (r)
 		return r;
-
+	/* CPU update is only supported on large BAR systems */
+	vm->is_vm_update_mode_cpu = (vm_update_mode &&
+				     amdgpu_vm_is_large_bar(adev));
+	DRM_DEBUG_DRIVER("VM update mode is %s\n",
+			 vm->is_vm_update_mode_cpu ? "CPU" : "SDMA");
 	vm->last_dir_update = NULL;
 
 	r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, 0), align, true,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index abc0bab..46fde0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -96,6 +96,17 @@ struct amdgpu_vm_pt {
 	unsigned		last_entry_used;
 };
 
+/* amdgpu_vm_update_context - User parameter that controls how VM pde/pte are
+ *  updated for Graphics and Compute. BIT0 controls Graphics and BIT1 controls
+ *  Compute.
+ *  BIT0 [= 0] Graphics updated by SDMA [= 1] by CPU
+ *  BIT1 [= 0] Compute updated by SDMA [= 1] by CPU
+ *  NOTE: CPU update is supported only on large BAR systems
+ */
+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK (1 << 0)
+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_COMPUTE_MASK (1 << 1)
+
+
 struct amdgpu_vm {
 	/* tree of virtual addresses mapped */
 	struct rb_root		va;
@@ -129,6 +140,9 @@ struct amdgpu_vm {
 	struct amdgpu_vm_id	*reserved_vmid[AMDGPU_MAX_VMHUBS];
 	/* each VM will map on CSA */
 	struct amdgpu_bo_va *csa_bo_va;
+
+	/* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */
+	bool                    is_vm_update_mode_cpu : 1;
 };
 
 struct amdgpu_vm_id {
@@ -190,7 +204,8 @@ struct amdgpu_vm_manager {
 
 void amdgpu_vm_manager_init(struct amdgpu_device *adev);
 void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+		   bool vm_update_mode);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 			 struct list_head *validated,
-- 
1.9.1


[-- Attachment #3: 0002-drm-amdgpu-Support-page-directory-update-via-CPU.patch --]
[-- Type: application/octet-stream, Size: 7819 bytes --]

From 575a3780e9b946443055ef59afa1336f3883d4a5 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Tue, 11 Apr 2017 11:15:32 -0400
Subject: [PATCH 2/4] drm/amdgpu: Support page directory update via CPU

If the amdgpu.vm_update_context param is set to use the CPU, then Page
Directories will be updated by the CPU instead of SDMA. CPU update,
however, is supported only on large BAR systems.

NOTE: Support for page table entries will be done in a separate commit.

Change-Id: I842983c292e7961bc4aad8f4542239189d72d08c
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 149 ++++++++++++++++++++++++---------
 1 file changed, 109 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index b5871f4..f26e052 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -271,6 +271,7 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 				  uint64_t saddr, uint64_t eaddr,
 				  unsigned level)
 {
+	u64 flags;
 	unsigned shift = (adev->vm_manager.num_level - level) *
 		adev->vm_manager.block_size;
 	unsigned pt_idx, from, to;
@@ -299,6 +300,12 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 	saddr = saddr & ((1 << shift) - 1);
 	eaddr = eaddr & ((1 << shift) - 1);
 
+	flags = (vm->is_vm_update_mode_cpu) ?
+			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
+			AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
+			AMDGPU_GEM_CREATE_SHADOW;
+
+
 	/* walk over the address space and allocate the page tables */
 	for (pt_idx = from; pt_idx <= to; ++pt_idx) {
 		struct reservation_object *resv = vm->root.bo->tbo.resv;
@@ -310,8 +317,7 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 					     amdgpu_vm_bo_size(adev, level),
 					     AMDGPU_GPU_PAGE_SIZE, true,
 					     AMDGPU_GEM_DOMAIN_VRAM,
-					     AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-					     AMDGPU_GEM_CREATE_SHADOW |
+					     flags |
 					     AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
 					     AMDGPU_GEM_CREATE_VRAM_CLEARED,
 					     NULL, resv, &pt);
@@ -909,6 +915,33 @@ static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
 	return result;
 }
 
+/**
+ * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
+ *
+ * @params: see amdgpu_pte_update_params definition
+ * @pe: addr of the page entry
+ * @addr: dst addr to write into pe
+ * @count: number of page entries to update
+ * @incr: increase next addr by incr bytes
+ * @flags: hw access flags
+ *
+ */
+static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
+				   uint64_t pe, uint64_t addr,
+				   unsigned count, uint32_t incr,
+				   uint64_t flags)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++) {
+		amdgpu_gart_set_pte_pde(params->adev, (void *)pe,
+					i, addr, flags);
+		addr += incr;
+	}
+	mb();
+	amdgpu_gart_flush_gpu_tlb(params->adev, 0);
+}
+
 /*
  * amdgpu_vm_update_level - update a single level in the hierarchy
  *
@@ -934,38 +967,67 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 	struct amdgpu_pte_update_params params;
 	struct fence *fence = NULL;
 
-	int r;
+	int r = 0;
 
 	if (!parent->entries)
 		return 0;
-	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
-	/* padding, etc. */
-	ndw = 64;
-
-	/* assume the worst case */
-	ndw += parent->last_entry_used * 6;
+	memset(&params, 0, sizeof(params));
+	params.adev = adev;
+	shadow = parent->bo->shadow;
 
-	pd_addr = amdgpu_bo_gpu_offset(parent->bo);
+	/* If the is_vm_update_mode_cpu flag is set, try to update the page
+	 * table using the CPU; if that fails, fall back to SDMA
+	 */
+	WARN_ON(vm->is_vm_update_mode_cpu && shadow);
+	if (vm->is_vm_update_mode_cpu && !shadow) {
+		struct amdgpu_sync sync;
 
-	shadow = parent->bo->shadow;
-	if (shadow) {
-		r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
+		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
 		if (r)
-			return r;
-		shadow_addr = amdgpu_bo_gpu_offset(shadow);
-		ndw *= 2;
-	} else {
-		shadow_addr = 0;
+			dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
+		else {
+			/* Wait for BO to be free. SDMA could be clearing it */
+			amdgpu_sync_create(&sync);
+			amdgpu_sync_resv(adev, &sync, parent->bo->tbo.resv,
+					 AMDGPU_FENCE_OWNER_VM);
+			amdgpu_sync_wait(&sync);
+			amdgpu_sync_free(&sync);
+			params.func = amdgpu_vm_cpu_set_ptes;
+		}
 	}
 
-	r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-	if (r)
-		return r;
+	if (r || !vm->is_vm_update_mode_cpu) {
+		if (shadow) {
+			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
+			if (r)
+				return r;
+		}
+		ring = container_of(vm->entity.sched, struct amdgpu_ring,
+				    sched);
 
-	memset(&params, 0, sizeof(params));
-	params.adev = adev;
-	params.ib = &job->ibs[0];
+		/* padding, etc. */
+		ndw = 64;
+
+		/* assume the worst case */
+		ndw += parent->last_entry_used * 6;
+
+		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
+
+		if (shadow) {
+			shadow_addr = amdgpu_bo_gpu_offset(shadow);
+			ndw *= 2;
+		} else {
+			shadow_addr = 0;
+		}
+
+		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+		if (r)
+			return r;
+
+		params.ib = &job->ibs[0];
+		params.func = amdgpu_vm_do_set_ptes;
+	}
 
 	/* walk over the address space and update the directory */
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
@@ -1000,15 +1062,15 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 					amdgpu_vm_adjust_mc_addr(adev, last_pt);
 
 				if (shadow)
-					amdgpu_vm_do_set_ptes(&params,
-							      last_shadow,
-							      pt_addr, count,
-							      incr,
-							      AMDGPU_PTE_VALID);
-
-				amdgpu_vm_do_set_ptes(&params, last_pde,
-						      pt_addr, count, incr,
-						      AMDGPU_PTE_VALID);
+					params.func(&params,
+						    last_shadow,
+						    pt_addr, count,
+						    incr,
+						    AMDGPU_PTE_VALID);
+
+				params.func(&params, last_pde,
+					    pt_addr, count, incr,
+					    AMDGPU_PTE_VALID);
 			}
 
 			count = 1;
@@ -1024,14 +1086,16 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		uint64_t pt_addr = amdgpu_vm_adjust_mc_addr(adev, last_pt);
 
 		if (vm->root.bo->shadow)
-			amdgpu_vm_do_set_ptes(&params, last_shadow, pt_addr,
-					      count, incr, AMDGPU_PTE_VALID);
+			params.func(&params, last_shadow, pt_addr,
+				    count, incr, AMDGPU_PTE_VALID);
 
-		amdgpu_vm_do_set_ptes(&params, last_pde, pt_addr,
-				      count, incr, AMDGPU_PTE_VALID);
+		params.func(&params, last_pde, pt_addr,
+			    count, incr, AMDGPU_PTE_VALID);
 	}
 
-	if (params.ib->length_dw == 0) {
+	if (params.func == amdgpu_vm_cpu_set_ptes)
+		amdgpu_bo_kunmap(parent->bo);
+	else if (params.ib->length_dw == 0) {
 		amdgpu_job_free(job);
 	} else {
 		amdgpu_ring_pad_ib(ring, params.ib);
@@ -2254,6 +2318,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 	struct amdgpu_ring *ring;
 	struct amd_sched_rq *rq;
 	int r, i;
+	u64 flags;
 
 	vm->va = RB_ROOT;
 	vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter);
@@ -2281,10 +2346,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 			 vm->is_vm_update_mode_cpu ? "CPU" : "SDMA");
 	vm->last_dir_update = NULL;
 
+	flags = (vm->is_vm_update_mode_cpu) ?
+			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
+			AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
+			AMDGPU_GEM_CREATE_SHADOW;
+
 	r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, 0), align, true,
 			     AMDGPU_GEM_DOMAIN_VRAM,
-			     AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-			     AMDGPU_GEM_CREATE_SHADOW |
+			     flags |
 			     AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
 			     AMDGPU_GEM_CREATE_VRAM_CLEARED,
 			     NULL, NULL, &vm->root.bo);
-- 
1.9.1
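
A note for readers on the helper the CPU path above relies on: amdgpu_gart_set_pte_pde() packs the destination address and the hardware flags into one 64-bit entry and writes it through the kernel mapping of the table. A rough model (the real helper dispatches through per-ASIC callbacks, and the exact address mask varies by generation):

	/* Conceptual PTE write; illustrative only */
	uint64_t value = addr & 0x0000FFFFFFFFF000ULL;	/* page address bits */
	value |= flags;					/* valid/system/cache bits */
	writeq(value, (uint64_t __iomem *)pe + i);	/* one 64-bit entry */

The mb() and amdgpu_gart_flush_gpu_tlb() calls after the write loop then make the new entries visible to the GPU (the HDP flush discussed later in the thread).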


[-- Attachment #4: 0003-drm-amdgpu-Return-EINVAL-if-no-PT-BO.patch --]
[-- Type: application/octet-stream, Size: 4623 bytes --]

From 82abbb495917342b81b00c798841dc800d7a211d Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Wed, 12 Apr 2017 12:20:16 -0400
Subject: [PATCH 3/4] drm/amdgpu: Return EINVAL if no PT BO

This change is also useful for the upcoming changes where page tables
can be updated by CPU.

Change-Id: I07510ed60c94cf1944ee96bb4b16c40ec88ea17c
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 48 +++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index f26e052..44d6d34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1190,8 +1190,9 @@ static struct amdgpu_bo *amdgpu_vm_get_pt(struct amdgpu_pte_update_params *p,
  * @flags: mapping flags
  *
  * Update the page tables in the range @start - @end.
+ * Returns 0 for success, -EINVAL for failure.
  */
-static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
+static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 				  uint64_t start, uint64_t end,
 				  uint64_t dst, uint64_t flags)
 {
@@ -1209,12 +1210,12 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 	pt = amdgpu_vm_get_pt(params, addr);
 	if (!pt) {
 		pr_err("PT not found, aborting update_ptes\n");
-		return;
+		return -EINVAL;
 	}
 
 	if (params->shadow) {
 		if (!pt->shadow)
-			return;
+			return 0;
 		pt = pt->shadow;
 	}
 	if ((addr & ~mask) == (end & ~mask))
@@ -1236,12 +1237,12 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 		pt = amdgpu_vm_get_pt(params, addr);
 		if (!pt) {
 			pr_err("PT not found, aborting update_ptes\n");
-			return;
+			return -EINVAL;
 		}
 
 		if (params->shadow) {
 			if (!pt->shadow)
-				return;
+				return 0;
 			pt = pt->shadow;
 		}
 
@@ -1276,6 +1277,8 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 
 	params->func(params, cur_pe_start, cur_dst, cur_nptes,
 		     AMDGPU_GPU_PAGE_SIZE, flags);
+
+	return 0;
 }
 
 /*
@@ -1287,11 +1290,14 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
  * @end: last PTE to handle
  * @dst: addr those PTEs should point to
  * @flags: hw mapping flags
+ * Returns 0 for success, -EINVAL for failure.
  */
-static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
+static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
 				uint64_t start, uint64_t end,
 				uint64_t dst, uint64_t flags)
 {
+	int r;
+
 	/**
 	 * The MC L1 TLB supports variable sized pages, based on a fragment
 	 * field in the PTE. When this field is set to a non-zero value, page
@@ -1320,28 +1326,30 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
 
 	/* system pages are non continuously */
 	if (params->src || !(flags & AMDGPU_PTE_VALID) ||
-	    (frag_start >= frag_end)) {
-
-		amdgpu_vm_update_ptes(params, start, end, dst, flags);
-		return;
-	}
+	    (frag_start >= frag_end))
+		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
 
 	/* handle the 4K area at the beginning */
 	if (start != frag_start) {
-		amdgpu_vm_update_ptes(params, start, frag_start,
-				      dst, flags);
+		r = amdgpu_vm_update_ptes(params, start, frag_start,
+					  dst, flags);
+		if (r)
+			return r;
 		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
 	}
 
 	/* handle the area in the middle */
-	amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
-			      flags | frag_flags);
+	r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
+				  flags | frag_flags);
+	if (r)
+		return r;
 
 	/* handle the 4K area at the end */
 	if (frag_end != end) {
 		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
-		amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
+		r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
 	}
+	return r;
 }
 
 /**
@@ -1462,9 +1470,13 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
 		goto error_free;
 
 	params.shadow = true;
-	amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	if (r)
+		goto error_free;
 	params.shadow = false;
-	amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	if (r)
+		goto error_free;
 
 	amdgpu_ring_pad_ib(ring, params.ib);
 	WARN_ON(params.ib->length_dw > ndw);
-- 
1.9.1


[-- Attachment #5: 0004-drm-amdgpu-Support-page-table-update-via-CPU.patch --]
[-- Type: application/octet-stream, Size: 5744 bytes --]

From faa24ca969392e19f650239b90123eee5b717d0d Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Wed, 19 Apr 2017 16:54:16 -0400
Subject: [PATCH 4/4] drm/amdgpu: Support page table update via CPU

Change-Id: I5fe1009de222e60c406b36d7a4271ac15d80e286
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 113 ++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 44d6d34..e969c60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -79,6 +79,12 @@ struct amdgpu_pte_update_params {
 		     uint64_t flags);
 	/* indicate update pt or its shadow */
 	bool shadow;
+	/* The next three are used during VM update by CPU */
+	bool update_by_cpu;
+	/* DMA addresses to use for mapping */
+	dma_addr_t *pages_addr;
+	/* Kernel pointer of PD/PT BO that needs to be updated */
+	void *kptr;
 };
 
 /* Helper to disable partial resident texture feature from a fence callback */
@@ -919,12 +925,14 @@ static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
  * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
  *
  * @params: see amdgpu_pte_update_params definition
- * @pe: addr of the page entry
+ * @pe: kmap addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: hw access flags
  *
+ * Used for updating system memory that is not GART-mapped to the same
+ * device, VRAM, and page directory entries.
  */
 static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
 				   uint64_t pe, uint64_t addr,
@@ -932,12 +940,17 @@ static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
 				   uint64_t flags)
 {
 	unsigned int i;
+	uint64_t value;
 
 	for (i = 0; i < count; i++) {
+		value = params->pages_addr ?
+			amdgpu_vm_map_gart(params->pages_addr, addr) :
+			addr;
 		amdgpu_gart_set_pte_pde(params->adev, (void *)pe,
-					i, addr, flags);
+					i, value, flags);
 		addr += incr;
 	}
+
 	mb();
 	amdgpu_gart_flush_gpu_tlb(params->adev, 0);
 }
@@ -985,7 +998,8 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
 		if (r)
-			dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
+			dev_warn(adev->dev,
+				 "Page table update using CPU failed. Fallback to SDMA\n");
 		else {
 			/* Wait for BO to be free. SDMA could be clearing it */
 			amdgpu_sync_create(&sync);
@@ -1180,6 +1194,59 @@ static struct amdgpu_bo *amdgpu_vm_get_pt(struct amdgpu_pte_update_params *p,
 }
 
 /**
+ * amdgpu_vm_update_ptes_cpu - Update the page tables in the range
+ *  @start - @end using the CPU.
+ * See amdgpu_vm_update_ptes for parameter description.
+ *
+ */
+static int amdgpu_vm_update_ptes_cpu(struct amdgpu_pte_update_params *params,
+				     uint64_t start, uint64_t end,
+				     uint64_t dst, uint64_t flags)
+{
+	struct amdgpu_device *adev = params->adev;
+	const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
+	void *pe_ptr;
+	uint64_t addr;
+	struct amdgpu_bo *pt;
+	unsigned int nptes;
+	int r;
+
+	/* initialize the variables */
+	addr = start;
+
+	/* walk over the address space and update the page tables */
+	while (addr < end) {
+		pt = amdgpu_vm_get_pt(params, addr);
+		if (!pt) {
+			pr_err("PT not found, aborting update_ptes\n");
+			return -EINVAL;
+		}
+
+		WARN_ON(params->shadow);
+
+		r = amdgpu_bo_kmap(pt, &pe_ptr);
+		if (r)
+			return r;
+
+		pe_ptr += (addr & mask) * 8;
+
+		if ((addr & ~mask) == (end & ~mask))
+			nptes = end - addr;
+		else
+			nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask);
+
+		params->func(params, (uint64_t)pe_ptr, dst, nptes,
+			     AMDGPU_GPU_PAGE_SIZE, flags);
+
+		amdgpu_bo_kunmap(pt);
+		addr += nptes;
+		dst += nptes * AMDGPU_GPU_PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/**
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @params: see amdgpu_pte_update_params definition
@@ -1205,6 +1272,13 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 	unsigned nptes; /* next number of ptes to be updated */
 	uint64_t next_pe_start;
 
+	if (params->update_by_cpu)
+		/* Return the CPU-based update status; on failure the
+		 * caller retries using SDMA
+		 */
+		return amdgpu_vm_update_ptes_cpu(params, start, end,
+						 dst, flags);
+
 	/* initialize the variables */
 	addr = start;
 	pt = amdgpu_vm_get_pt(params, addr);
@@ -1391,6 +1465,39 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
 	params.vm = vm;
 	params.src = src;
 
+	if (vm->is_vm_update_mode_cpu) {
+		struct amdgpu_sync sync;
+
+		amdgpu_sync_create(&sync);
+		amdgpu_sync_resv(adev, &sync, vm->root.bo->tbo.resv,
+				 AMDGPU_FENCE_OWNER_VM);
+		amdgpu_sync_wait(&sync);
+		amdgpu_sync_free(&sync);
+
+		params.func = amdgpu_vm_cpu_set_ptes;
+
+		/* params.src is used as a flag to indicate system memory */
+		if (pages_addr)
+			params.src = ~0;
+
+		params.pages_addr = pages_addr;
+		params.update_by_cpu = true;
+
+		params.shadow = false;
+		r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+		if (!r)
+			return r;
+
+		dev_warn(adev->dev,
+			 "CPU update of VM failed. Fallback to SDMA\n");
+
+		/* Reset params for SDMA fallback path */
+		params.update_by_cpu = false;
+		params.pages_addr = NULL;
+		params.kptr = NULL;
+		params.func = NULL;
+	}
+
 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
 	/* sync to everything on unmapping */
-- 
1.9.1
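
For reference, the amdgpu_vm_map_gart() translation that patch #4 reuses for system-memory pages conceptually performs (a simplified sketch of the existing helper):

	/* Look up the DMA address of the backing page, then re-apply the
	 * offset within the page.
	 */
	uint64_t result = pages_addr[addr >> PAGE_SHIFT];
	result &= 0xFFFFFFFFFFFFF000ULL;	/* page-aligned DMA address */
	result |= addr & ~PAGE_MASK;		/* in-page offset */

This is why the CPU path carries pages_addr in amdgpu_pte_update_params: with SDMA the engine can copy such entries from the GART table, but the CPU writer has to resolve each address itself.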



* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found] ` <DM3PR1201MB10382F282D646D630708246D8CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-05-12  8:25   ` zhoucm1
       [not found]     ` <5915717A.3000209-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: zhoucm1 @ 2017-05-12  8:25 UTC (permalink / raw)
  To: Kasiviswanathan, Harish, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 986 bytes --]


On 2017-05-10 05:47, Kasiviswanathan, Harish wrote:
> Hi,
>
> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
I think your improved performance comes from less waiting on CS;
generally, the SDMA engine updates page tables faster than the CPU,
otherwise we wouldn't need the SDMA for updating PTs.
So perhaps this improvement shows we have some redundant sync when
mapping / unmapping. If yes, we should fix that, and then it's not clear
whether the CPU method is needed or not.

Regards,
David Zhou
>
> Best Regards,
> Harish
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]     ` <5915717A.3000209-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-12  8:33       ` Christian König
       [not found]         ` <21e206ed-8d60-450f-6d23-e01c68c5ad73-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2017-05-12  8:33 UTC (permalink / raw)
  To: zhoucm1, Kasiviswanathan, Harish,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 1568 bytes --]

On 2017-05-12 at 10:25, zhoucm1 wrote:
>
> On 2017-05-10 05:47, Kasiviswanathan, Harish wrote:
>> Hi,
>>
>> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
> I think your improved performance is from less waiting for cs, 
> generally, SDMA engine updating page table is faster than CPU, 
> otherwise we don't need sdma for updating PT.
> So whether your this improvement proves we have some redundant sync 
> when mapping / unmapping, if yes, we should fix that, then not sure if 
> CPU method is need or not.

The problem is that the KFD is designed synchronously for page table
updates. In other words, they need to wait for the update to finish, and
that takes time.

Apart from that, your comment is absolutely correct: we found that the
SDMA is sometimes much faster at doing the update than the CPU.

Regards,
Christian.

>
> Regards,
> David Zhou
>> Best Regards,
>> Harish
>>
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx




* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]         ` <21e206ed-8d60-450f-6d23-e01c68c5ad73-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-12  8:37           ` zhoucm1
       [not found]             ` <59157453.8010308-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: zhoucm1 @ 2017-05-12  8:37 UTC (permalink / raw)
  To: Christian König, Kasiviswanathan, Harish,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 1881 bytes --]



On 2017-05-12 16:33, Christian König wrote:
> On 2017-05-12 at 10:25, zhoucm1 wrote:
>>
>> On 2017-05-10 05:47, Kasiviswanathan, Harish wrote:
>>> Hi,
>>>
>>> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
>> I think your improved performance is from less waiting for cs, 
>> generally, SDMA engine updating page table is faster than CPU, 
>> otherwise we don't need sdma for updating PT.
>> So whether your this improvement proves we have some redundant sync 
>> when mapping / unmapping, if yes, we should fix that, then not sure 
>> if CPU method is need or not.
>
> The problem is that the KFD is designed synchronously for page table 
> updates. In other words they need to wait for the update to finish and 
> that takes time.
>
> Apart from that your comment is absolutely correct, we found that the 
> SDMA is sometimes much faster to do the update than the CPU.
If the SDMA is faster, then even if they wait for it to finish, the time
is shorter than with the CPU, isn't it? Of course, the precondition is
that the SDMA is exclusive; they could reserve an SDMA engine for PT
updates.

Regards,
David Zhou
>
> Regards,
> Christian.
>
>>
>> Regards,
>> David Zhou
>>> Best Regards,
>>> Harish
>>>
>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>



* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]             ` <59157453.8010308-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-12  8:43               ` Christian König
       [not found]                 ` <b82fc6cd-c19e-85c2-8628-89c2cb10c5c0-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2017-05-12  8:43 UTC (permalink / raw)
  To: zhoucm1, Kasiviswanathan, Harish,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 2442 bytes --]

On 2017-05-12 at 10:37, zhoucm1 wrote:
>
>
> On 2017-05-12 16:33, Christian König wrote:
>> On 2017-05-12 at 10:25, zhoucm1 wrote:
>>>
>>> On 2017-05-10 05:47, Kasiviswanathan, Harish wrote:
>>>> Hi,
>>>>
>>>> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
>>> I think your improved performance is from less waiting for cs, 
>>> generally, SDMA engine updating page table is faster than CPU, 
>>> otherwise we don't need sdma for updating PT.
>>> So whether your this improvement proves we have some redundant sync 
>>> when mapping / unmapping, if yes, we should fix that, then not sure 
>>> if CPU method is need or not.
>>
>> The problem is that the KFD is designed synchronously for page table 
>> updates. In other words they need to wait for the update to finish 
>> and that takes time.
>>
>> Apart from that your comment is absolutely correct, we found that the 
>> SDMA is sometimes much faster to do the update than the CPU.
> If the sdma is faster, even they wait for finish, which time is 
> shorter than CPU, isn't it? Of course, the precondition is sdma is 
> exclusive. They can reserve a sdma for PT updating.

No, if I understood Felix's numbers correctly, the setup and wait time
for SDMA is a bit (but not much) longer than doing it with the CPU.

What would really help is to fix the KFD design and work with async page
table updates there as well.

Regards,
Christian.

>
> Regards,
> David Zhou
>>
>> Regards,
>> Christian.
>>
>>>
>>> Regards,
>>> David Zhou
>>>> Best Regards,
>>>> Harish
>>>>
>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>>
>>>
>>> _______________________________________________
>>> amd-gfx mailing list
>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>
>>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx




* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]                 ` <b82fc6cd-c19e-85c2-8628-89c2cb10c5c0-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-12  8:44                   ` zhoucm1
  2017-05-12 19:25                   ` Felix Kuehling
  1 sibling, 0 replies; 12+ messages in thread
From: zhoucm1 @ 2017-05-12  8:44 UTC (permalink / raw)
  To: Christian König, Kasiviswanathan, Harish,
	amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 2628 bytes --]



On 2017-05-12 16:43, Christian König wrote:
> On 2017-05-12 at 10:37, zhoucm1 wrote:
>>
>>
>> On 2017-05-12 16:33, Christian König wrote:
>>> On 2017-05-12 at 10:25, zhoucm1 wrote:
>>>>
>>>> On 2017-05-10 05:47, Kasiviswanathan, Harish wrote:
>>>>> Hi,
>>>>>
>>>>> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
>>>> I think your improved performance is from less waiting for cs, 
>>>> generally, SDMA engine updating page table is faster than CPU, 
>>>> otherwise we don't need sdma for updating PT.
>>>> So whether your this improvement proves we have some redundant sync 
>>>> when mapping / unmapping, if yes, we should fix that, then not sure 
>>>> if CPU method is need or not.
>>>
>>> The problem is that the KFD is designed synchronously for page table 
>>> updates. In other words they need to wait for the update to finish 
>>> and that takes time.
>>>
>>> Apart from that your comment is absolutely correct, we found that 
>>> the SDMA is sometimes much faster to do the update than the CPU.
>> If the sdma is faster, even they wait for finish, which time is 
>> shorter than CPU, isn't it? Of course, the precondition is sdma is 
>> exclusive. They can reserve a sdma for PT updating.
>
> No, if I understood Felix numbers correctly the setup and wait time 
> for SDMA is a bit (but not much) longer than doing it with the CPU.
>
> What would really help is to fix the KFD design and work with async 
> page tables updates there as well.
OK, no problem, just curious.

Regards,
David Zhou
>
> Regards,
> Christian.
>
>>
>> Regards,
>> David Zhou
>>>
>>> Regards,
>>> Christian.
>>>
>>>>
>>>> Regards,
>>>> David Zhou
>>>>> Best Regards,
>>>>> Harish
>>>>>
>>>>>
>>>>>
>>>>> _______________________________________________
>>>>> amd-gfx mailing list
>>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>>
>>>>
>>>>
>>>> _______________________________________________
>>>> amd-gfx mailing list
>>>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>>>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>>>
>>>
>>
>>
>>
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW@public.gmane.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
>



* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]                 ` <b82fc6cd-c19e-85c2-8628-89c2cb10c5c0-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  2017-05-12  8:44                   ` zhoucm1
@ 2017-05-12 19:25                   ` Felix Kuehling
       [not found]                     ` <9f9f1e0c-caf6-7233-03c6-59196f672a72-5C7GfCeVMHo@public.gmane.org>
  1 sibling, 1 reply; 12+ messages in thread
From: Felix Kuehling @ 2017-05-12 19:25 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 17-05-12 04:43 AM, Christian König wrote:
> Am 12.05.2017 um 10:37 schrieb zhoucm1:
>>
>>
>>
>> If the sdma is faster, even they wait for finish, which time is
>> shorter than CPU, isn't it? Of course, the precondition is sdma is
>> exclusive. They can reserve a sdma for PT updating.
>>
>
> No, if I understood Felix numbers correctly the setup and wait time
> for SDMA is a bit (but not much) longer than doing it with the CPU.

I'm skeptical of claims that SDMA is faster. Even when you use SDMA to
write the page table, the CPU still has to do about the same amount of
work writing PTEs into the SDMA IBs. SDMA can only save CPU time in
certain cases:

  * Copying PTEs from GART table if they are on the same GPU (not
    possible on Vega10 due to different MTYPE bits)
  * Generating PTEs for contiguous VRAM BOs

At least for system memory BOs, writing the PTEs directly to
write-combining VRAM should be faster than writing them to cached system
memory IBs first and then kicking off an SDMA transfer and waiting for
completion.
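
As a rough worked example of that comparison: mapping a 2 MB system-memory BO takes 512 PTEs at 8 bytes each, i.e. 4 KB of PTE writes; even at a modest 1 GB/s of write-combined throughput that is on the order of 4 microseconds of bus traffic, so the fixed IB-allocation, submission and fence-wait overhead of the SDMA path would dominate. (Illustrative numbers, not measurements.)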

>
> What would really help is to fix the KFD design and work with async
> page tables updates there as well.

That problem goes much higher up the stack than just KFD. It would
affect memory management interfaces in the HSA runtime and HCC.

The basic idea is to make the GPU behave very similar to a CPU and to
have multi-threaded code where some threads run on the CPU and others on
the GPU almost seamlessly. You allocate memory and then you use the same
pointer in your CPU and GPU threads. Exposing the messiness of
asynchronous page table updates all the way up to the application would
destroy that programming model.

In this model, latency matters most. The longer it takes to kick off a
parallel GPU processing job, the less efficient scaling you get from the
GPUs parallel processing capabilities. Exposing asynchronous memory
management up the stack would allow the application to hide the latency
in some cases (if it can do other useful things in the mean time), but
it doesn't make the latency disappear.

An application that wants to hide memory management latency can do this,
even with the existing programming model, by separating memory
management and processing into separate threads.

Regards,
  Felix



* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]                     ` <9f9f1e0c-caf6-7233-03c6-59196f672a72-5C7GfCeVMHo@public.gmane.org>
@ 2017-05-13  9:08                       ` Christian König
  0 siblings, 0 replies; 12+ messages in thread
From: Christian König @ 2017-05-13  9:08 UTC (permalink / raw)
  To: Felix Kuehling, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

On 2017-05-12 at 21:25, Felix Kuehling wrote:
> On 17-05-12 04:43 AM, Christian König wrote:
> On 2017-05-12 at 10:37, zhoucm1 wrote:
>>>
>>>
>>> If the sdma is faster, even they wait for finish, which time is
>>> shorter than CPU, isn't it? Of course, the precondition is sdma is
>>> exclusive. They can reserve a sdma for PT updating.
>>>
>> No, if I understood Felix numbers correctly the setup and wait time
>> for SDMA is a bit (but not much) longer than doing it with the CPU.
> I'm skeptical of claims that SDMA is faster. Even when you use SDMA to
> write the page table, the CPU still has to do about the same amount of
> work writing PTEs into the SDMA IBs. SDMA can only save CPU time in
> certain cases:
>
>    * Copying PTEs from GART table if they are on the same GPU (not
>      possible on Vega10 due to different MTYPE bits)
>    * Generating PTEs for contiguous VRAM BOs
>
> At least for system memory BOs writing the PTEs directly to
> write-combining VRAM should be faster than writing them to cached system
> memory IBs first and then kicking off an SDMA transfer and waiting for
> completion.

That's unfortunately not correct at all.

Nicolai did quite a few measurements on this, and even with WC enabled,
on most systems the SDMA is more efficient at transferring even small
amounts of memory over the bus than the CPU.

And no, we couldn't figure out why; it indeed doesn't make much sense
when WC is enabled.

I think the SDMA is simply optimized for those kinds of transfers, even
considering the overhead of allocating an IB.

So anything larger than, I would say, 1KB is handled faster when you
write it to system memory and then copy it to VRAM with the SDMA.
Regards,
Christian.

* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found]         ` <DM3PR1201MB1038AAC8F7B04B7F3E015FE88CEC0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-05-10 19:22           ` Christian König
  0 siblings, 0 replies; 12+ messages in thread
From: Christian König @ 2017-05-10 19:22 UTC (permalink / raw)
  To: Kasiviswanathan, Harish, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

> [HK]: From Compute perspective, all validations are synchronous. PT/PD BOs are validated when restored after evictions. However, the clearing of PT/PD BOs during creation are still done by GPU which mandates the waiting.
> [HK]: I am not clear on how to modify amdgpu_bo_kmap(). The function currently waits for the exclusive fence and now optionally it has to wait for all the shared fences.
The clear operation is added as the exclusive fence, so we should 
already wait for that in amdgpu_bo_kmap().

>   So are you suggesting to modify kmap function to amdgpu_bo_kmap(struct amdgpu_bo *bo, void **ptr, void *fence_owner). If fence_onwer != NULL it would wait for all the shared fences. Please clarify.
No, that would indeed be quite a bit ugly.

As far as I can see you can just completely drop the code, because we 
already wait for both clear as well as swap operations to finish in 
amdgpu_bo_kmap() by waiting for the exclusive fence.
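
For readers following the fence discussion, the distinction being drawn is roughly this (a sketch against the reservation-object API named earlier in the thread; illustrative, not the exact kmap code):

	/* Wait only for the exclusive fence, as amdgpu_bo_kmap() does: */
	r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
						false /* wait_all */, false,
						MAX_SCHEDULE_TIMEOUT);

	/* Waiting for the shared fences as well would pass wait_all = true: */
	r = reservation_object_wait_timeout_rcu(bo->tbo.resv,
						true /* wait_all */, false,
						MAX_SCHEDULE_TIMEOUT);

Since the clear is attached as the exclusive fence, the first form already covers it, which is the point being made above.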

Regards,
Christian.

On 2017-05-10 at 20:50, Kasiviswanathan, Harish wrote:
> Thanks Christian for the review. One clarification required. Please see my comments and question inline.
>
> From: Christian König [mailto:deathsimple@vodafone.de]
> Sent: Wednesday, May 10, 2017 3:24 AM
> To: Kasiviswanathan, Harish <Harish.Kasiviswanathan@amd.com>; amd-gfx@lists.freedesktop.org
> Subject: Re: Support for amdgpu VM update via CPU on large-bar systems
>
> Hi Harish,
>
> for the next time please use git send-email to send patches to the mailing list.
>
> Looks pretty good in general, but a few notes all over the place.
>
> Patch #1:
>
> +static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
> +{
> +    if (adev->mc.visible_vram_size > 0 &&
> +            adev->mc.real_vram_size == adev->mc.visible_vram_size)
> +        return true;
> +    return false;
> +}
> The visible_vram_size > 0 check looks superfluous. The coding style looks off, the second line of the "if" is to far right.
>
> And in general that should rather look like "return adev->mc.real_vram_size == adev->mc.visible_vram_size;".
>
>
> +    /* CPU update is only supported for large BAR system */
> +    vm->is_vm_update_mode_cpu = (vm_update_mode &&
> +                     amdgpu_vm_is_large_bar(adev));
> Mhm, it would be nice if we could enable this for testing even on systems with small BAR.
> [HK] : I will add bit in the module parameter that can override large-bar requirement.
>
> I would rather suggest to make the default value depend on if the BOX has a large or small BAR and let the user override the setting.
>
> Additional to that we should limit this to 64bit systems.
>
>
> +#define AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK (1 << 0)
> +#define AMDGPU_VM_USE_CPU_UPDATE_FOR_COMPUTE_MASK (1 << 1)
> Not much of an issue, but the names are a bit long. Maybe use something like AMDGPU_VM_USE_CPU_FOR_GFX and AMDGPU_VM_USE_CPU_FOR_COMPUTE.
>
>
> +    bool                    is_vm_update_mode_cpu : 1;
> Please no bitmasks when we don't need them.
>
> Patch #2:
>
> +    u64 flags;
>       unsigned shift = (adev->vm_manager.num_level - level) *
>           adev->vm_manager.block_size;
> Reverse tree order.
>
>
> +    flags = (vm->is_vm_update_mode_cpu) ?
> +            AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
> +            AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
> +            AMDGPU_GEM_CREATE_SHADOW;
> +
> +
> ...
>
> -                         AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
> -                         AMDGPU_GEM_CREATE_SHADOW |
> +                         flags |
>                            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>                            AMDGPU_GEM_CREATE_VRAM_CLEARED,
>
> I would rather write it like this:
>
> flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | AMDGPU_GEM_CREATE_VRAM_CLEARED;
> if (vm->is_vm_update_mode_cpu)
>      flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
> else
>      flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS | AMDGPU_GEM_CREATE_SHADOW;
>
>
> +    mb();
> +    amdgpu_gart_flush_gpu_tlb(params->adev, 0);
> You do this for the HDP flush, don't you?
>
> [HK]: Yes
>
>
> +            dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
> NACK, if kmapping the BO fails we most likely are either out of memory or out of address space.
>
> Falling back to the SDMA doesn't make the situation better, just return a proper error code here.
>
>
> +            /* Wait for BO to be free. SDMA could be clearing it */
> +            amdgpu_sync_create(&sync);
> +            amdgpu_sync_resv(adev, &sync, parent->bo->tbo.resv,
> +                     AMDGPU_FENCE_OWNER_VM);
> +            amdgpu_sync_wait(&sync);
> +            amdgpu_sync_free(&sync);
> Superfluous and a bit incorrect, amdgpu_bo_kmap already calls reservation_object_wait_timeout_rcu() but only for the exclusive fence.
>
> You probably ran into problems because of pipelined evictions, so that should be fixed in amdgpu_bo_kmap() instead.
>
> [HK]: From Compute perspective, all validations are synchronous. PT/PD BOs are validated when restored after evictions. However, the clearing of PT/PD BOs during creation are still done by GPU which mandates the waiting.
> [HK]: I am not clear on how to modify amdgpu_bo_kmap(). The function currently waits for the exclusive fence and now optionally it has to wait for all the shared fences. So are you suggesting to modify kmap function to amdgpu_bo_kmap(struct amdgpu_bo *bo, void **ptr, void *fence_owner). If fence_onwer != NULL it would wait for all the shared fences. Please clarify.
>    
>
>
> -                    amdgpu_vm_do_set_ptes(&params,
> -                                  last_shadow,
> -                                  pt_addr, count,
> -                                  incr,
> -                                  AMDGPU_PTE_VALID);
> -
> -                amdgpu_vm_do_set_ptes(&params, last_pde,
> -                              pt_addr, count, incr,
> -                              AMDGPU_PTE_VALID);
> +                    params.func(&params,
> +                            last_shadow,
> +                            pt_addr, count,
> +                            incr,
> +                            AMDGPU_PTE_VALID);
> +
> +                params.func(&params, last_pde,
> +                        pt_addr, count, incr,
> +                        AMDGPU_PTE_VALID);
> Might be a good idea to separate that into a cleanup patch.
>
> Patch #3: Looks good to me and is Reviewed-by: Christian König <christian.koenig@amd.com>.
>
> Please move that one to the beginning of the series and/or commit it directly to amd-staging-4.9.
>
> Patch #4:
>
> +    /* The next three are used during VM update by CPU */
> +    bool update_by_cpu;
> We can distinguish that as well by looking at the function pointer, don't we?
>
>
> +        dev_warn(adev->dev,
> +             "CPU update of VM failed. Fallback to SDMA\n");
> +
> +        /* Reset params for SDMA fallback path */
> +        params.update_by_cpu = false;
> +        params.pages_addr = NULL;
> +        params.kptr = NULL;
> +        params.func = NULL;
> Again completely drop the SDMA fallback path.
>
> Regards,
> Christian.
>
> On 2017-05-09 at 20:34, Kasiviswanathan, Harish wrote:
> Hi,
>
> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
>
> Best Regards,
> Harish
>
>
>
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>


* RE: Support for amdgpu VM update via CPU on large-bar systems
       [not found]     ` <1b144623-3315-5dd7-fcc7-3be9e24e0f21-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
@ 2017-05-10 18:50       ` Kasiviswanathan, Harish
       [not found]         ` <DM3PR1201MB1038AAC8F7B04B7F3E015FE88CEC0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Kasiviswanathan, Harish @ 2017-05-10 18:50 UTC (permalink / raw)
  To: Christian König, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Thanks Christian for the review. One clarification required. Please see my comments and question inline.

From: Christian König [mailto:deathsimple@vodafone.de] 
Sent: Wednesday, May 10, 2017 3:24 AM
To: Kasiviswanathan, Harish <Harish.Kasiviswanathan@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: Support for amdgpu VM update via CPU on large-bar systems

Hi Harish,

For next time, please use git send-email to send patches to the mailing list.

Looks pretty good in general, but a few notes all over the place.

Patch #1:

+static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
+{
+    if (adev->mc.visible_vram_size > 0 &&
+            adev->mc.real_vram_size == adev->mc.visible_vram_size)
+        return true;
+    return false;
+}
The visible_vram_size > 0 check looks superfluous. The coding style looks off; the second line of the "if" is too far right.

And in general that should rather look like "return adev->mc.real_vram_size == adev->mc.visible_vram_size;".


+    /* CPU update is only supported for large BAR system */
+    vm->is_vm_update_mode_cpu = (vm_update_mode &&
+                     amdgpu_vm_is_large_bar(adev));
Mhm, it would be nice if we could enable this for testing even on systems with small BAR.
[HK] : I will add a bit to the module parameter that can override the large-bar requirement.

I would rather suggest making the default value depend on whether the box has a large or small BAR, and let the user override the setting.

In addition to that, we should limit this to 64-bit systems.


+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK (1 << 0)
+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_COMPUTE_MASK (1 << 1)
Not much of an issue, but the names are a bit long. Maybe use something like AMDGPU_VM_USE_CPU_FOR_GFX and AMDGPU_VM_USE_CPU_FOR_COMPUTE.


+    bool                    is_vm_update_mode_cpu : 1;
Please no bitmasks when we don't need them.

Patch #2:

+    u64 flags;
     unsigned shift = (adev->vm_manager.num_level - level) *
         adev->vm_manager.block_size;
Reverse tree order.


+    flags = (vm->is_vm_update_mode_cpu) ?
+            AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
+            AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
+            AMDGPU_GEM_CREATE_SHADOW;
+
+
...

-                         AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-                         AMDGPU_GEM_CREATE_SHADOW |
+                         flags |
                          AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
                          AMDGPU_GEM_CREATE_VRAM_CLEARED,

I would rather write it like this:

flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | AMDGPU_GEM_CREATE_VRAM_CLEARED;
if (vm->is_vm_update_mode_cpu)
    flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
else
    flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS | AMDGPU_GEM_CREATE_SHADOW;


+    mb();
+    amdgpu_gart_flush_gpu_tlb(params->adev, 0);
You do this for the HDP flush, don't you?
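
If so, a short comment in the code would help future readers, e.g. (a sketch):

	/* The GART TLB flush is (ab)used here for its HDP flush, so that
	 * page table entries written by the CPU become visible to the GPU
	 */
	mb();
	amdgpu_gart_flush_gpu_tlb(params->adev, 0);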

[HK]: Yes


+            dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
NACK; if kmapping the BO fails we are most likely either out of memory or out of address space.

Falling back to the SDMA doesn't make the situation better; just return a proper error code here.
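
With the fallback dropped, the error path is just (a sketch):

	r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
	if (r)
		return r;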


+            /* Wait for BO to be free. SDMA could be clearing it */
+            amdgpu_sync_create(&sync);
+            amdgpu_sync_resv(adev, &sync, parent->bo->tbo.resv,
+                     AMDGPU_FENCE_OWNER_VM);
+            amdgpu_sync_wait(&sync);
+            amdgpu_sync_free(&sync);
Superfluous and a bit incorrect; amdgpu_bo_kmap() already calls reservation_object_wait_timeout_rcu(), but only for the exclusive fence.

You probably ran into problems because of pipelined evictions, so that should be fixed in amdgpu_bo_kmap() instead.

[HK]: From the compute perspective, all validations are synchronous. PT/PD BOs are validated when restored after evictions. However, the clearing of PT/PD BOs during creation is still done by the GPU, which mandates the waiting.
[HK]: I am not clear on how to modify amdgpu_bo_kmap(). The function currently waits for the exclusive fence, and now it would optionally have to wait for all the shared fences as well. So are you suggesting changing the kmap function to something like the sketch below, where fence_owner != NULL makes it wait for all the shared fences? Please clarify.
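
A sketch of the signature I have in mind (hypothetical, only to illustrate the question):

	int amdgpu_bo_kmap(struct amdgpu_bo *bo, void **ptr,
			   void *fence_owner);
	/* fence_owner != NULL: also wait for the shared fences before
	 * mapping, not just the exclusive one
	 */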
  


-                    amdgpu_vm_do_set_ptes(&params,
-                                  last_shadow,
-                                  pt_addr, count,
-                                  incr,
-                                  AMDGPU_PTE_VALID);
-
-                amdgpu_vm_do_set_ptes(&params, last_pde,
-                              pt_addr, count, incr,
-                              AMDGPU_PTE_VALID);
+                    params.func(&params,
+                            last_shadow,
+                            pt_addr, count,
+                            incr,
+                            AMDGPU_PTE_VALID);
+
+                params.func(&params, last_pde,
+                        pt_addr, count, incr,
+                        AMDGPU_PTE_VALID);
Might be a good idea to separate that into a cleanup patch.

Patch #3: Looks good to me and is Reviewed-by: Christian König <christian.koenig@amd.com>.

Please move that one to the beginning of the series and/or commit it directly to amd-staging-4.9.

Patch #4:

+    /* The next three are used during VM update by CPU */
+    bool update_by_cpu;
We can distinguish that as well by looking at the function pointer, can't we?
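
E.g. (a sketch using the function pointer already stored in the params):

	if (params->func == amdgpu_vm_cpu_set_ptes) {
		/* CPU based update path */
	}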


+        dev_warn(adev->dev,
+             "CPU update of VM failed. Fallback to SDMA\n");
+
+        /* Reset params for SDMA fallback path */
+        params.update_by_cpu = false;
+        params.pages_addr = NULL;
+        params.kptr = NULL;
+        params.func = NULL;
Again completely drop the SDMA fallback path.

Regards,
Christian.

On 09.05.2017 at 20:34, Kasiviswanathan, Harish wrote:
Hi,

Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.

Best Regards,
Harish





_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* Re: Support for amdgpu VM update via CPU on large-bar systems
       [not found] ` <DM3PR1201MB103847C11F89904752F962768CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
@ 2017-05-10  7:23   ` Christian König
       [not found]     ` <1b144623-3315-5dd7-fcc7-3be9e24e0f21-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Christian König @ 2017-05-10  7:23 UTC (permalink / raw)
  To: Kasiviswanathan, Harish, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW


[-- Attachment #1.1: Type: text/plain, Size: 5515 bytes --]

Hi Harish,

For next time, please use git send-email to send patches to the
mailing list.

Looks pretty good in general, but a few notes all over the place.

Patch #1:
> +static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
> +{
> +    if (adev->mc.visible_vram_size > 0 &&
> +            adev->mc.real_vram_size == adev->mc.visible_vram_size)
> +        return true;
> +    return false;
> +}
The visible_vram_size > 0 check looks superfluous. The coding style
looks off; the second line of the "if" is too far right.

And in general that should rather look like "return 
adev->mc.real_vram_size == adev->mc.visible_vram_size;".

> +    /* CPU update is only supported for large BAR system */
> +    vm->is_vm_update_mode_cpu = (vm_update_mode &&
> +                     amdgpu_vm_is_large_bar(adev));
Mhm, it would be nice if we could enable this for testing even on 
systems with small BAR.

I would rather suggest making the default value depend on whether the
box has a large or small BAR and letting the user override the setting.

In addition to that, we should limit this to 64-bit systems.

> +#define AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK (1 << 0)
> +#define AMDGPU_VM_USE_CPU_UPDATE_FOR_COMPUTE_MASK (1 << 1)
Not much of an issue, but the names are a bit long. Maybe use something 
like AMDGPU_VM_USE_CPU_FOR_GFX and AMDGPU_VM_USE_CPU_FOR_COMPUTE.

> +    bool is_vm_update_mode_cpu : 1;
Please no bitmasks when we don't need them.

Patch #2:
> +    u64 flags;
>      unsigned shift = (adev->vm_manager.num_level - level) *
>          adev->vm_manager.block_size;
Reverse tree order (longest declaration line first).

> +    flags = (vm->is_vm_update_mode_cpu) ?
> +            AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
> +            AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
> +            AMDGPU_GEM_CREATE_SHADOW;
> +
> +
...
> -                         AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
> -                         AMDGPU_GEM_CREATE_SHADOW |
> +                         flags |
>                           AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
>                           AMDGPU_GEM_CREATE_VRAM_CLEARED,

I would rather write it like this:

flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS | AMDGPU_GEM_CREATE_VRAM_CLEARED;
if (vm->is_vm_update_mode_cpu)
     flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
else
     flags |= AMDGPU_GEM_CREATE_NO_CPU_ACCESS | AMDGPU_GEM_CREATE_SHADOW;

> +    mb();
> +    amdgpu_gart_flush_gpu_tlb(params->adev, 0);
You do this for the HDP flush, don't you?

> +            dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
NACK; if kmapping the BO fails we are most likely either out of memory
or out of address space.

Falling back to the SDMA doesn't make the situation better; just return
a proper error code here.

> +            /* Wait for BO to be free. SDMA could be clearing it */
> +            amdgpu_sync_create(&sync);
> +            amdgpu_sync_resv(adev, &sync, parent->bo->tbo.resv,
> +                     AMDGPU_FENCE_OWNER_VM);
> +            amdgpu_sync_wait(&sync);
> +            amdgpu_sync_free(&sync);
Superfluous and a bit incorrect; amdgpu_bo_kmap() already calls
reservation_object_wait_timeout_rcu(), but only for the exclusive fence.

You probably ran into problems because of pipelined evictions, so that 
should be fixed in amdgpu_bo_kmap() instead.

> -                    amdgpu_vm_do_set_ptes(&params,
> -                                  last_shadow,
> -                                  pt_addr, count,
> -                                  incr,
> -                                  AMDGPU_PTE_VALID);
> -
> -                amdgpu_vm_do_set_ptes(&params, last_pde,
> -                              pt_addr, count, incr,
> -                              AMDGPU_PTE_VALID);
> +                    params.func(&params,
> +                            last_shadow,
> +                            pt_addr, count,
> +                            incr,
> +                            AMDGPU_PTE_VALID);
> +
> +                params.func(&params, last_pde,
> +                        pt_addr, count, incr,
> +                        AMDGPU_PTE_VALID);
Might be a good idea to separate that into a cleanup patch.

Patch #3: Looks good to me and is Reviewed-by: Christian König 
<christian.koenig-5C7GfCeVMHo@public.gmane.org>.

Please move that one to the beginning of the series and/or commit it 
directly to amd-staging-4.9.

Patch #4:
> +    /* The next three are used during VM update by CPU */
> +    bool update_by_cpu;
We can distinguish that as well by looking at the function pointer,
can't we?

> +        dev_warn(adev->dev,
> +             "CPU update of VM failed. Fallback to SDMA\n");
> +
> +        /* Reset params for SDMA fallback path */
> +        params.update_by_cpu = false;
> +        params.pages_addr = NULL;
> +        params.kptr = NULL;
> +        params.func = NULL;
Again completely drop the SDMA fallback path.

Regards,
Christian.

On 09.05.2017 at 20:34, Kasiviswanathan, Harish wrote:
> Hi,
>
> Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.
>
> Best Regards,
> Harish
>



[-- Attachment #1.2: Type: text/html, Size: 9055 bytes --]

[-- Attachment #2: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


* Support for amdgpu VM update via CPU on large-bar systems
@ 2017-05-09 18:34 Kasiviswanathan, Harish
       [not found] ` <DM3PR1201MB103847C11F89904752F962768CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
  0 siblings, 1 reply; 12+ messages in thread
From: Kasiviswanathan, Harish @ 2017-05-09 18:34 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

[-- Attachment #1: Type: text/plain, Size: 364 bytes --]

Hi,

Please review the patch set that supports amdgpu VM update via CPU. This feature provides improved performance for compute (HSA) where mapping / unmapping is carried out (by Kernel) independent of command submissions (done directly by user space). This version doesn't support shadow copy of VM page tables for CPU based update.

Best Regards,
Harish


[-- Attachment #2: 0001-drm-amdgpu-Add-vm-context-module-param.patch --]
[-- Type: application/octet-stream, Size: 6483 bytes --]

From 109b8cbf83c704e881ae92261fe556fb71a5bffd Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Tue, 9 May 2017 12:04:24 -0400
Subject: [PATCH 1/4] drm/amdgpu: Add vm context module param

Add VM context module param (amdgpu.vm_update_context) that can be used to
control how the VM pde/pte are updated for Graphics and Compute.

BIT0 controls Graphics and BIT1 Compute.
 BIT0 [= 0] Graphics updated by SDMA [= 1] by CPU
 BIT1 [= 0] Compute updated by SDMA [= 1] by CPU

However, CPU based update is only supported for large BAR systems.

By default, vm_update_context = 2, indicating that Graphics VMs will be
updated via SDMA and Compute VMs will be updated via CPU (for large BAR
system).

Change-Id: I95b2913b30d1027d1224a97e0ec92c08ae6494f2
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  4 ++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 17 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  | 17 ++++++++++++++++-
 5 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 5310781..d420ae5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -93,6 +93,7 @@
 extern int amdgpu_vm_block_size;
 extern int amdgpu_vm_fault_stop;
 extern int amdgpu_vm_debug;
+extern int amdgpu_vm_update_context;
 extern int amdgpu_dc;
 extern int amdgpu_sched_jobs;
 extern int amdgpu_sched_hw_submission;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 416908a..5b8a6ff 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -93,6 +93,7 @@
 int amdgpu_vm_block_size = -1;
 int amdgpu_vm_fault_stop = 0;
 int amdgpu_vm_debug = 0;
+int amdgpu_vm_update_context = 2;
 int amdgpu_vram_page_split = 1024;
 int amdgpu_exp_hw_support = 0;
 int amdgpu_dc = -1;
@@ -179,6 +180,9 @@
 MODULE_PARM_DESC(vm_debug, "Debug VM handling (0 = disabled (default), 1 = enabled)");
 module_param_named(vm_debug, amdgpu_vm_debug, int, 0644);
 
+MODULE_PARM_DESC(vm_update_context, "VM update using CPU on large BAR. (0 = never, 1 = Graphics only, 2 = Compute only (default), 3 = Both)");
+module_param_named(vm_update_context, amdgpu_vm_update_context, int, 0444);
+
 MODULE_PARM_DESC(vram_page_split, "Number of pages after we split VRAM allocations (default 1024, -1 = disable)");
 module_param_named(vram_page_split, amdgpu_vram_page_split, int, 0444);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index c2d4465..e2b394e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -757,7 +757,9 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
 		goto out_suspend;
 	}
 
-	r = amdgpu_vm_init(adev, &fpriv->vm);
+	r = amdgpu_vm_init(adev, &fpriv->vm,
+			   !!(amdgpu_vm_update_context &
+			   AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK));
 	if (r) {
 		kfree(fpriv);
 		goto out_suspend;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index ed97a2e..b5871f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -694,6 +694,13 @@ static u64 amdgpu_vm_adjust_mc_addr(struct amdgpu_device *adev, u64 mc_addr)
 	return addr;
 }
 
+static bool amdgpu_vm_is_large_bar(struct amdgpu_device *adev)
+{
+	if (adev->mc.visible_vram_size > 0 &&
+			adev->mc.real_vram_size == adev->mc.visible_vram_size)
+		return true;
+	return false;
+}
 /**
  * amdgpu_vm_flush - hardware flush the vm
  *
@@ -2234,10 +2241,12 @@ void amdgpu_vm_adjust_size(struct amdgpu_device *adev, uint64_t vm_size)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @vm_update_mode: false = use SDMA, true = use CPU to update page tables
  *
  * Init @vm fields.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+		   bool vm_update_mode)
 {
 	const unsigned align = min(AMDGPU_VM_PTB_ALIGN_SIZE,
 		AMDGPU_VM_PTE_COUNT(adev) * 8);
@@ -2265,7 +2274,11 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 				  rq, amdgpu_sched_jobs);
 	if (r)
 		return r;
-
+	/* CPU update is only supported for large BAR system */
+	vm->is_vm_update_mode_cpu = (vm_update_mode &&
+				     amdgpu_vm_is_large_bar(adev));
+	DRM_DEBUG_DRIVER("VM update mode is %s\n",
+			 vm->is_vm_update_mode_cpu ? "CPU" : "SDMA");
 	vm->last_dir_update = NULL;
 
 	r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, 0), align, true,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index abc0bab..46fde0f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -96,6 +96,17 @@ struct amdgpu_vm_pt {
 	unsigned		last_entry_used;
 };
 
+/* amdgpu_vm_update_context - User parameter that controls how VM pde/pte are
+ *  updated for Graphics and Compute. BIT0 controls Graphics and BIT1 controls
+ *  Compute.
+ *  BIT0 [= 0] Graphics updated by SDMA [= 1] by CPU
+ *  BIT1 [= 0] Compute updated by SDMA [= 1] by CPU
+ *  NOTE: CPU update is supported only in large BAR systems
+ */
+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_GRAPHICS_MASK (1 << 0)
+#define AMDGPU_VM_USE_CPU_UPDATE_FOR_COMPUTE_MASK (1 << 1)
+
+
 struct amdgpu_vm {
 	/* tree of virtual addresses mapped */
 	struct rb_root		va;
@@ -129,6 +140,9 @@ struct amdgpu_vm {
 	struct amdgpu_vm_id	*reserved_vmid[AMDGPU_MAX_VMHUBS];
 	/* each VM will map on CSA */
 	struct amdgpu_bo_va *csa_bo_va;
+
+	/* Flag to indicate if VM tables are updated by CPU or GPU (SDMA) */
+	bool                    is_vm_update_mode_cpu : 1;
 };
 
 struct amdgpu_vm_id {
@@ -190,7 +204,8 @@ struct amdgpu_vm_manager {
 
 void amdgpu_vm_manager_init(struct amdgpu_device *adev);
 void amdgpu_vm_manager_fini(struct amdgpu_device *adev);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+		   bool vm_update_mode);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
 			 struct list_head *validated,
-- 
1.9.1


[-- Attachment #3: 0002-drm-amdgpu-Support-page-directory-update-via-CPU.patch --]
[-- Type: application/octet-stream, Size: 7819 bytes --]

From 575a3780e9b946443055ef59afa1336f3883d4a5 Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Tue, 11 Apr 2017 11:15:32 -0400
Subject: [PATCH 2/4] drm/amdgpu: Support page directory update via CPU

If the amdgpu.vm_update_context param is set to use CPU, then Page
Directories will be updated by the CPU instead of SDMA. CPU update,
however, is supported only on large-BAR systems.

NOTE: Support for page table entries will be done in a separate commit.

Change-Id: I842983c292e7961bc4aad8f4542239189d72d08c
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 149 ++++++++++++++++++++++++---------
 1 file changed, 109 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index b5871f4..f26e052 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -271,6 +271,7 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 				  uint64_t saddr, uint64_t eaddr,
 				  unsigned level)
 {
+	u64 flags;
 	unsigned shift = (adev->vm_manager.num_level - level) *
 		adev->vm_manager.block_size;
 	unsigned pt_idx, from, to;
@@ -299,6 +300,12 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 	saddr = saddr & ((1 << shift) - 1);
 	eaddr = eaddr & ((1 << shift) - 1);
 
+	flags = (vm->is_vm_update_mode_cpu) ?
+			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
+			AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
+			AMDGPU_GEM_CREATE_SHADOW;
+
+
 	/* walk over the address space and allocate the page tables */
 	for (pt_idx = from; pt_idx <= to; ++pt_idx) {
 		struct reservation_object *resv = vm->root.bo->tbo.resv;
@@ -310,8 +317,7 @@ static int amdgpu_vm_alloc_levels(struct amdgpu_device *adev,
 					     amdgpu_vm_bo_size(adev, level),
 					     AMDGPU_GPU_PAGE_SIZE, true,
 					     AMDGPU_GEM_DOMAIN_VRAM,
-					     AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-					     AMDGPU_GEM_CREATE_SHADOW |
+					     flags |
 					     AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
 					     AMDGPU_GEM_CREATE_VRAM_CLEARED,
 					     NULL, resv, &pt);
@@ -909,6 +915,33 @@ static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
 	return result;
 }
 
+/**
+ * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
+ *
+ * @params: see amdgpu_pte_update_params definition
+ * @pe: addr of the page entry
+ * @addr: dst addr to write into pe
+ * @count: number of page entries to update
+ * @incr: increase next addr by incr bytes
+ * @flags: hw access flags
+ *
+ */
+static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
+				   uint64_t pe, uint64_t addr,
+				   unsigned count, uint32_t incr,
+				   uint64_t flags)
+{
+	unsigned int i;
+
+	for (i = 0; i < count; i++) {
+		amdgpu_gart_set_pte_pde(params->adev, (void *)pe,
+					i, addr, flags);
+		addr += incr;
+	}
+	mb();
+	amdgpu_gart_flush_gpu_tlb(params->adev, 0);
+}
+
 /*
  * amdgpu_vm_update_level - update a single level in the hierarchy
  *
@@ -934,38 +967,67 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 	struct amdgpu_pte_update_params params;
 	struct fence *fence = NULL;
 
-	int r;
+	int r = 0;
 
 	if (!parent->entries)
 		return 0;
-	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
-	/* padding, etc. */
-	ndw = 64;
-
-	/* assume the worst case */
-	ndw += parent->last_entry_used * 6;
+	memset(&params, 0, sizeof(params));
+	params.adev = adev;
+	shadow = parent->bo->shadow;
 
-	pd_addr = amdgpu_bo_gpu_offset(parent->bo);
+	/* If is_vm_update_mode_cpu flag is set, then try to update page table
+	 * using the CPU. If that fails, fall back to SDMA
+	 */
+	WARN_ON(vm->is_vm_update_mode_cpu && shadow);
+	if (vm->is_vm_update_mode_cpu && !shadow) {
+		struct amdgpu_sync sync;
 
-	shadow = parent->bo->shadow;
-	if (shadow) {
-		r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
+		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
 		if (r)
-			return r;
-		shadow_addr = amdgpu_bo_gpu_offset(shadow);
-		ndw *= 2;
-	} else {
-		shadow_addr = 0;
+			dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
+		else {
+			/* Wait for BO to be free. SDMA could be clearing it */
+			amdgpu_sync_create(&sync);
+			amdgpu_sync_resv(adev, &sync, parent->bo->tbo.resv,
+					 AMDGPU_FENCE_OWNER_VM);
+			amdgpu_sync_wait(&sync);
+			amdgpu_sync_free(&sync);
+			params.func = amdgpu_vm_cpu_set_ptes;
+		}
 	}
 
-	r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
-	if (r)
-		return r;
+	if (r || !vm->is_vm_update_mode_cpu) {
+		if (shadow) {
+			r = amdgpu_ttm_bind(&shadow->tbo, &shadow->tbo.mem);
+			if (r)
+				return r;
+		}
+		ring = container_of(vm->entity.sched, struct amdgpu_ring,
+				    sched);
 
-	memset(&params, 0, sizeof(params));
-	params.adev = adev;
-	params.ib = &job->ibs[0];
+		/* padding, etc. */
+		ndw = 64;
+
+		/* assume the worst case */
+		ndw += parent->last_entry_used * 6;
+
+		pd_addr = amdgpu_bo_gpu_offset(parent->bo);
+
+		if (shadow) {
+			shadow_addr = amdgpu_bo_gpu_offset(shadow);
+			ndw *= 2;
+		} else {
+			shadow_addr = 0;
+		}
+
+		r = amdgpu_job_alloc_with_ib(adev, ndw * 4, &job);
+		if (r)
+			return r;
+
+		params.ib = &job->ibs[0];
+		params.func = amdgpu_vm_do_set_ptes;
+	}
 
 	/* walk over the address space and update the directory */
 	for (pt_idx = 0; pt_idx <= parent->last_entry_used; ++pt_idx) {
@@ -1000,15 +1062,15 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 					amdgpu_vm_adjust_mc_addr(adev, last_pt);
 
 				if (shadow)
-					amdgpu_vm_do_set_ptes(&params,
-							      last_shadow,
-							      pt_addr, count,
-							      incr,
-							      AMDGPU_PTE_VALID);
-
-				amdgpu_vm_do_set_ptes(&params, last_pde,
-						      pt_addr, count, incr,
-						      AMDGPU_PTE_VALID);
+					params.func(&params,
+						    last_shadow,
+						    pt_addr, count,
+						    incr,
+						    AMDGPU_PTE_VALID);
+
+				params.func(&params, last_pde,
+					    pt_addr, count, incr,
+					    AMDGPU_PTE_VALID);
 			}
 
 			count = 1;
@@ -1024,14 +1086,16 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 		uint64_t pt_addr = amdgpu_vm_adjust_mc_addr(adev, last_pt);
 
 		if (vm->root.bo->shadow)
-			amdgpu_vm_do_set_ptes(&params, last_shadow, pt_addr,
-					      count, incr, AMDGPU_PTE_VALID);
+			params.func(&params, last_shadow, pt_addr,
+				    count, incr, AMDGPU_PTE_VALID);
 
-		amdgpu_vm_do_set_ptes(&params, last_pde, pt_addr,
-				      count, incr, AMDGPU_PTE_VALID);
+		params.func(&params, last_pde, pt_addr,
+			    count, incr, AMDGPU_PTE_VALID);
 	}
 
-	if (params.ib->length_dw == 0) {
+	if (params.func == amdgpu_vm_cpu_set_ptes)
+		amdgpu_bo_kunmap(parent->bo);
+	else if (params.ib->length_dw == 0) {
 		amdgpu_job_free(job);
 	} else {
 		amdgpu_ring_pad_ib(ring, params.ib);
@@ -2254,6 +2318,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm
 	struct amdgpu_ring *ring;
 	struct amd_sched_rq *rq;
 	int r, i;
+	u64 flags;
 
 	vm->va = RB_ROOT;
 	vm->client_id = atomic64_inc_return(&adev->vm_manager.client_counter);
@@ -2281,10 +2346,14 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm
 			 vm->is_vm_update_mode_cpu ? "CPU" : "SDMA");
 	vm->last_dir_update = NULL;
 
+	flags = (vm->is_vm_update_mode_cpu) ?
+			AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED :
+			AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
+			AMDGPU_GEM_CREATE_SHADOW;
+
 	r = amdgpu_bo_create(adev, amdgpu_vm_bo_size(adev, 0), align, true,
 			     AMDGPU_GEM_DOMAIN_VRAM,
-			     AMDGPU_GEM_CREATE_NO_CPU_ACCESS |
-			     AMDGPU_GEM_CREATE_SHADOW |
+			     flags |
 			     AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
 			     AMDGPU_GEM_CREATE_VRAM_CLEARED,
 			     NULL, NULL, &vm->root.bo);
-- 
1.9.1


[-- Attachment #4: 0003-drm-amdgpu-Return-EINVAL-if-no-PT-BO.patch --]
[-- Type: application/octet-stream, Size: 4623 bytes --]

From 82abbb495917342b81b00c798841dc800d7a211d Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Wed, 12 Apr 2017 12:20:16 -0400
Subject: [PATCH 3/4] drm/amdgpu: Return EINVAL if no PT BO

This change is also useful for the upcoming changes where page tables
can be updated by CPU.

Change-Id: I07510ed60c94cf1944ee96bb4b16c40ec88ea17c
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 48 +++++++++++++++++++++-------------
 1 file changed, 30 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index f26e052..44d6d34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1190,8 +1190,9 @@ static struct amdgpu_bo *amdgpu_vm_get_pt(struct amdgpu_pte_update_params *p,
  * @flags: mapping flags
  *
  * Update the page tables in the range @start - @end.
+ * Returns 0 for success, -EINVAL for failure.
  */
-static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
+static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 				  uint64_t start, uint64_t end,
 				  uint64_t dst, uint64_t flags)
 {
@@ -1209,12 +1210,12 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 	pt = amdgpu_vm_get_pt(params, addr);
 	if (!pt) {
 		pr_err("PT not found, aborting update_ptes\n");
-		return;
+		return -EINVAL;
 	}
 
 	if (params->shadow) {
 		if (!pt->shadow)
-			return;
+			return 0;
 		pt = pt->shadow;
 	}
 	if ((addr & ~mask) == (end & ~mask))
@@ -1236,12 +1237,12 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 		pt = amdgpu_vm_get_pt(params, addr);
 		if (!pt) {
 			pr_err("PT not found, aborting update_ptes\n");
-			return;
+			return -EINVAL;
 		}
 
 		if (params->shadow) {
 			if (!pt->shadow)
-				return;
+				return 0;
 			pt = pt->shadow;
 		}
 
@@ -1276,6 +1277,8 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 
 	params->func(params, cur_pe_start, cur_dst, cur_nptes,
 		     AMDGPU_GPU_PAGE_SIZE, flags);
+
+	return 0;
 }
 
 /*
@@ -1287,11 +1290,14 @@ static void amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
  * @end: last PTE to handle
  * @dst: addr those PTEs should point to
  * @flags: hw mapping flags
+ * Returns 0 for success, -EINVAL for failure.
  */
-static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
+static int amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
 				uint64_t start, uint64_t end,
 				uint64_t dst, uint64_t flags)
 {
+	int r;
+
 	/**
 	 * The MC L1 TLB supports variable sized pages, based on a fragment
 	 * field in the PTE. When this field is set to a non-zero value, page
@@ -1320,28 +1326,30 @@ static void amdgpu_vm_frag_ptes(struct amdgpu_pte_update_params	*params,
 
 	/* system pages are non continuously */
 	if (params->src || !(flags & AMDGPU_PTE_VALID) ||
-	    (frag_start >= frag_end)) {
-
-		amdgpu_vm_update_ptes(params, start, end, dst, flags);
-		return;
-	}
+	    (frag_start >= frag_end))
+		return amdgpu_vm_update_ptes(params, start, end, dst, flags);
 
 	/* handle the 4K area at the beginning */
 	if (start != frag_start) {
-		amdgpu_vm_update_ptes(params, start, frag_start,
-				      dst, flags);
+		r = amdgpu_vm_update_ptes(params, start, frag_start,
+					  dst, flags);
+		if (r)
+			return r;
 		dst += (frag_start - start) * AMDGPU_GPU_PAGE_SIZE;
 	}
 
 	/* handle the area in the middle */
-	amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
-			      flags | frag_flags);
+	r = amdgpu_vm_update_ptes(params, frag_start, frag_end, dst,
+				  flags | frag_flags);
+	if (r)
+		return r;
 
 	/* handle the 4K area at the end */
 	if (frag_end != end) {
 		dst += (frag_end - frag_start) * AMDGPU_GPU_PAGE_SIZE;
-		amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
+		r = amdgpu_vm_update_ptes(params, frag_end, end, dst, flags);
 	}
+	return r;
 }
 
 /**
@@ -1462,9 +1470,13 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
 		goto error_free;
 
 	params.shadow = true;
-	amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	if (r)
+		goto error_free;
 	params.shadow = false;
-	amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+	if (r)
+		goto error_free;
 
 	amdgpu_ring_pad_ib(ring, params.ib);
 	WARN_ON(params.ib->length_dw > ndw);
-- 
1.9.1


[-- Attachment #5: 0004-drm-amdgpu-Support-page-table-update-via-CPU.patch --]
[-- Type: application/octet-stream, Size: 5744 bytes --]

From faa24ca969392e19f650239b90123eee5b717d0d Mon Sep 17 00:00:00 2001
From: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
Date: Wed, 19 Apr 2017 16:54:16 -0400
Subject: [PATCH 4/4] drm/amdgpu: Support page table update via CPU

Change-Id: I5fe1009de222e60c406b36d7a4271ac15d80e286
Signed-off-by: Harish Kasiviswanathan <Harish.Kasiviswanathan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 113 ++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 44d6d34..e969c60 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -79,6 +79,12 @@ struct amdgpu_pte_update_params {
 		     uint64_t flags);
 	/* indicate update pt or its shadow */
 	bool shadow;
+	/* The next three are used during VM update by CPU */
+	bool update_by_cpu;
+	/* DMA addresses to use for mapping */
+	dma_addr_t *pages_addr;
+	/* Kernel pointer of PD/PT BO that needs to be updated */
+	void *kptr;
 };
 
 /* Helper to disable partial resident texture feature from a fence callback */
@@ -919,12 +925,14 @@ static uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr)
  * amdgpu_vm_cpu_set_ptes - helper to update page tables via CPU
  *
  * @params: see amdgpu_pte_update_params definition
- * @pe: addr of the page entry
+ * @pe: kmap addr of the page entry
  * @addr: dst addr to write into pe
  * @count: number of page entries to update
  * @incr: increase next addr by incr bytes
  * @flags: hw access flags
  *
+ * Used for updating system memory that is not GART-mapped to the same
+ * device, VRAM memory, and page directory entries.
  */
 static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
 				   uint64_t pe, uint64_t addr,
@@ -932,12 +940,17 @@ static void amdgpu_vm_cpu_set_ptes(struct amdgpu_pte_update_params *params,
 				   uint64_t flags)
 {
 	unsigned int i;
+	uint64_t value;
 
 	for (i = 0; i < count; i++) {
+		value = params->pages_addr ?
+			amdgpu_vm_map_gart(params->pages_addr, addr) :
+			addr;
 		amdgpu_gart_set_pte_pde(params->adev, (void *)pe,
-					i, addr, flags);
+					i, value, flags);
 		addr += incr;
 	}
+
 	mb();
 	amdgpu_gart_flush_gpu_tlb(params->adev, 0);
 }
@@ -985,7 +998,8 @@ static int amdgpu_vm_update_level(struct amdgpu_device *adev,
 
 		r = amdgpu_bo_kmap(parent->bo, (void **)&pd_addr);
 		if (r)
-			dev_warn(adev->dev, "Page table update using CPU failed. Fallback to SDMA\n");
+			dev_warn(adev->dev,
+				 "Page table update using CPU failed. Fallback to SDMA\n");
 		else {
 			/* Wait for BO to be free. SDMA could be clearing it */
 			amdgpu_sync_create(&sync);
@@ -1180,6 +1194,59 @@ static struct amdgpu_bo *amdgpu_vm_get_pt(struct amdgpu_pte_update_params *p,
 }
 
 /**
+ * amdgpu_vm_update_ptes_cpu - Update the page tables in the range
+ *  start - @end using CPU.
+ * See amdgpu_vm_update_ptes for parameter description.
+ *
+ */
+static int amdgpu_vm_update_ptes_cpu(struct amdgpu_pte_update_params *params,
+				     uint64_t start, uint64_t end,
+				     uint64_t dst, uint64_t flags)
+{
+	struct amdgpu_device *adev = params->adev;
+	const uint64_t mask = AMDGPU_VM_PTE_COUNT(adev) - 1;
+	void *pe_ptr;
+	uint64_t addr;
+	struct amdgpu_bo *pt;
+	unsigned int nptes;
+	int r;
+
+	/* initialize the variables */
+	addr = start;
+
+	/* walk over the address space and update the page tables */
+	while (addr < end) {
+		pt = amdgpu_vm_get_pt(params, addr);
+		if (!pt) {
+			pr_err("PT not found, aborting update_ptes\n");
+			return -EINVAL;
+		}
+
+		WARN_ON(params->shadow);
+
+		r = amdgpu_bo_kmap(pt, &pe_ptr);
+		if (r)
+			return r;
+
+		pe_ptr += (addr & mask) * 8;
+
+		if ((addr & ~mask) == (end & ~mask))
+			nptes = end - addr;
+		else
+			nptes = AMDGPU_VM_PTE_COUNT(adev) - (addr & mask);
+
+		params->func(params, (uint64_t)pe_ptr, dst, nptes,
+			     AMDGPU_GPU_PAGE_SIZE, flags);
+
+		amdgpu_bo_kunmap(pt);
+		addr += nptes;
+		dst += nptes * AMDGPU_GPU_PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+/**
  * amdgpu_vm_update_ptes - make sure that page tables are valid
  *
  * @params: see amdgpu_pte_update_params definition
@@ -1205,6 +1272,13 @@ static int amdgpu_vm_update_ptes(struct amdgpu_pte_update_params *params,
 	unsigned nptes; /* next number of ptes to be updated */
 	uint64_t next_pe_start;
 
+	if (params->update_by_cpu)
+		/* Return CPU based update status. If it fails, a retry
+		 * using SDMA will be done
+		 */
+		return amdgpu_vm_update_ptes_cpu(params, start, end,
+						  dst, flags);
+
 	/* initialize the variables */
 	addr = start;
 	pt = amdgpu_vm_get_pt(params, addr);
@@ -1391,6 +1465,39 @@ static int amdgpu_vm_bo_update_mapping(struct amdgpu_device *adev,
 	params.vm = vm;
 	params.src = src;
 
+	if (vm->is_vm_update_mode_cpu) {
+		struct amdgpu_sync sync;
+
+		amdgpu_sync_create(&sync);
+		amdgpu_sync_resv(adev, &sync, vm->root.bo->tbo.resv,
+				 AMDGPU_FENCE_OWNER_VM);
+		amdgpu_sync_wait(&sync);
+		amdgpu_sync_free(&sync);
+
+		params.func = amdgpu_vm_cpu_set_ptes;
+
+		/* params.src is used as a flag to indicate system memory */
+		if (pages_addr)
+			params.src = ~0;
+
+		params.pages_addr = pages_addr;
+		params.update_by_cpu = true;
+
+		params.shadow = false;
+		r = amdgpu_vm_frag_ptes(&params, start, last + 1, addr, flags);
+		if (!r)
+			return r;
+
+		dev_warn(adev->dev,
+			 "CPU update of VM failed. Fallback to SDMA\n");
+
+		/* Reset params for SDMA fallback path */
+		params.update_by_cpu = false;
+		params.pages_addr = NULL;
+		params.kptr = NULL;
+		params.func = NULL;
+	}
+
 	ring = container_of(vm->entity.sched, struct amdgpu_ring, sched);
 
 	/* sync to everything on unmapping */
-- 
1.9.1


[-- Attachment #6: Type: text/plain, Size: 154 bytes --]

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


end of thread, other threads:[~2017-05-13  9:08 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-05-09 21:47 Support for amdgpu VM update via CPU on large-bar systems Kasiviswanathan, Harish
     [not found] ` <DM3PR1201MB10382F282D646D630708246D8CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-05-12  8:25   ` zhoucm1
     [not found]     ` <5915717A.3000209-5C7GfCeVMHo@public.gmane.org>
2017-05-12  8:33       ` Christian König
     [not found]         ` <21e206ed-8d60-450f-6d23-e01c68c5ad73-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-12  8:37           ` zhoucm1
     [not found]             ` <59157453.8010308-5C7GfCeVMHo@public.gmane.org>
2017-05-12  8:43               ` Christian König
     [not found]                 ` <b82fc6cd-c19e-85c2-8628-89c2cb10c5c0-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-12  8:44                   ` zhoucm1
2017-05-12 19:25                   ` Felix Kuehling
     [not found]                     ` <9f9f1e0c-caf6-7233-03c6-59196f672a72-5C7GfCeVMHo@public.gmane.org>
2017-05-13  9:08                       ` Christian König
  -- strict thread matches above, loose matches on Subject: below --
2017-05-09 18:34 Kasiviswanathan, Harish
     [not found] ` <DM3PR1201MB103847C11F89904752F962768CEF0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-05-10  7:23   ` Christian König
     [not found]     ` <1b144623-3315-5dd7-fcc7-3be9e24e0f21-ANTagKRnAhcb1SvskN2V4Q@public.gmane.org>
2017-05-10 18:50       ` Kasiviswanathan, Harish
     [not found]         ` <DM3PR1201MB1038AAC8F7B04B7F3E015FE88CEC0-BBcFnVpqZhWjUUTFdQAMQmrFom/aUZj6nBOFsp37pqbUKgpGm//BTAC/G2K4zDHf@public.gmane.org>
2017-05-10 19:22           ` Christian König
