All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
@ 2021-10-19 18:14 Nirmoy Das
  2021-10-19 18:14 ` [PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr Nirmoy Das
                   ` (2 more replies)
  0 siblings, 3 replies; 18+ messages in thread
From: Nirmoy Das @ 2021-10-19 18:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky, Nirmoy Das

Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 ++++++++++++---------
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..5807df52031c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 
 	amdgpu_virt_init_data_exchange(adev);
 	/* we need recover gart prior to run SMC/CP/SDMA resume */
-	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
+	amdgpu_gtt_mgr_recover(adev);
 
 	r = amdgpu_device_fw_loading(adev);
 	if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 					amdgpu_inc_vram_lost(tmp_adev);
 				}
 
-				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
+				r = amdgpu_gtt_mgr_recover(tmp_adev);
 				if (r)
 					goto out;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..5e41f8ef743a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device *dev,
 {
 	struct drm_device *ddev = dev_get_drvdata(dev);
 	struct amdgpu_device *adev = drm_to_adev(ddev);
-	struct ttm_resource_manager *man;
 
-	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-	return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+	return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
 }
 
 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
 {
-	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-	s64 result = man->size - atomic64_read(&mgr->available);
+	struct ttm_resource_manager *man;
+	struct amdgpu_gtt_mgr *mgr;
+	s64 result;
+
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+	mgr = to_gtt_mgr(man);
+	result = man->size - atomic64_read(&mgr->available);
 
 	return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
 {
-	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-	struct amdgpu_device *adev;
+	struct ttm_resource_manager *man;
+	struct amdgpu_gtt_mgr *mgr;
 	struct amdgpu_gtt_node *node;
 	struct drm_mm_node *mm_node;
 	int r = 0;
 
-	adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+	mgr = to_gtt_mgr(man);
 	spin_lock(&mgr->lock);
 	drm_mm_for_each_node(mm_node, &mgr->mm) {
 		node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
@@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct ttm_resource_manager *man,
 				 struct drm_printer *printer)
 {
 	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+	struct amdgpu_device *adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
 
 	spin_lock(&mgr->lock);
 	drm_mm_print(&mgr->mm, printer);
@@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct ttm_resource_manager *man,
 
 	drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, usage:%lluMB\n",
 		   man->size, (u64)atomic64_read(&mgr->available),
-		   amdgpu_gtt_mgr_usage(man) >> 20);
+		   amdgpu_gtt_mgr_usage(adev) >> 20);
 }
 
 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d2955ea4a62b..b9b38f70e416 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		ui64 = amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM));
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_GTT_USAGE:
-		ui64 = amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
+		ui64 = amdgpu_gtt_mgr_usage(adev);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_GDS_CONFIG: {
 		struct drm_amdgpu_info_gds gds_info;
@@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
 			atomic64_read(&adev->gart_pin_size);
 		mem.gtt.heap_usage =
-			amdgpu_gtt_mgr_usage(gtt_man);
+			amdgpu_gtt_mgr_usage(adev);
 		mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
 
 		return copy_to_user(out, &mem,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 91a087f9dc7c..6e337cacef51 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev);
 void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
 
 bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
+int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
 
 uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man);
 
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr
  2021-10-19 18:14 [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Nirmoy Das
@ 2021-10-19 18:14 ` Nirmoy Das
  2021-10-19 18:14 ` [PATCH v2 3/3] drm/amdgpu: recover gart table at resume Nirmoy Das
  2021-10-20  6:49 ` [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Christian König
  2 siblings, 0 replies; 18+ messages in thread
From: Nirmoy Das @ 2021-10-19 18:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky, Nirmoy Das

Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call vram_mgr functions.

Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c       |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c      | 10 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c      |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h      |  8 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c     |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 54 ++++++++++++--------
 7 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7077f21f0021..4837c579a787 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -531,9 +531,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int dma_buf_fd,
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
 	struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-	struct ttm_resource_manager *vram_man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 
-	return amdgpu_vram_mgr_usage(vram_man);
+	return amdgpu_vram_mgr_usage(adev);
 }
 
 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..f4084ca8b614 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 {
 	s64 time_us, increment_us;
 	u64 free_vram, total_vram, used_vram;
-	struct ttm_resource_manager *vram_man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 	/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 	 * throttling.
 	 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 	}
 
 	total_vram = adev->gmc.real_vram_size - atomic64_read(&adev->vram_pin_size);
-	used_vram = amdgpu_vram_mgr_usage(vram_man);
+	used_vram = amdgpu_vram_mgr_usage(adev);
 	free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
 	spin_lock(&adev->mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct amdgpu_device *adev,
 	if (!amdgpu_gmc_vram_full_visible(&adev->gmc)) {
 		u64 total_vis_vram = adev->gmc.visible_vram_size;
 		u64 used_vis_vram =
-		  amdgpu_vram_mgr_vis_usage(vram_man);
+		  amdgpu_vram_mgr_vis_usage(adev);
 
 		if (used_vis_vram < total_vis_vram) {
 			u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b9b38f70e416..34674ccabd67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		ui64 = atomic64_read(&adev->num_vram_cpu_page_faults);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_VRAM_USAGE:
-		ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM));
+		ui64 = amdgpu_vram_mgr_usage(adev);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_VIS_VRAM_USAGE:
-		ui64 = amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM));
+		ui64 = amdgpu_vram_mgr_vis_usage(adev);
 		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
 	case AMDGPU_INFO_GTT_USAGE:
 		ui64 = amdgpu_gtt_mgr_usage(adev);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 	}
 	case AMDGPU_INFO_MEMORY: {
 		struct drm_amdgpu_memory_info mem;
-		struct ttm_resource_manager *vram_man =
-			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 		struct ttm_resource_manager *gtt_man =
 			ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
 		memset(&mem, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 			atomic64_read(&adev->vram_pin_size) -
 			AMDGPU_VM_RESERVED_VRAM;
 		mem.vram.heap_usage =
-			amdgpu_vram_mgr_usage(vram_man);
+			amdgpu_vram_mgr_usage(adev);
 		mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;
 
 		mem.cpu_accessible_vram.total_heap_size =
@@ -729,7 +727,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 			    atomic64_read(&adev->visible_pin_size),
 			    mem.vram.usable_heap_size);
 		mem.cpu_accessible_vram.heap_usage =
-			amdgpu_vram_mgr_vis_usage(vram_man);
+			amdgpu_vram_mgr_vis_usage(adev);
 		mem.cpu_accessible_vram.max_allocation =
 			mem.cpu_accessible_vram.usable_heap_size * 3 / 4;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd..2e339d37ba35 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1804,8 +1804,7 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
-		status = amdgpu_vram_mgr_query_page_status(
-				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+		status = amdgpu_vram_mgr_query_page_status(adev,
 				data->bps[i].retired_page);
 		if (status == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
@@ -1906,8 +1905,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 			goto out;
 		}
 
-		amdgpu_vram_mgr_reserve_range(
-			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+		amdgpu_vram_mgr_reserve_range(adev,
 			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
 			AMDGPU_GPU_PAGE_SIZE);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index 6e337cacef51..99861418a52e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -129,11 +129,11 @@ int amdgpu_vram_mgr_alloc_sgt(struct amdgpu_device *adev,
 void amdgpu_vram_mgr_free_sgt(struct device *dev,
 			      enum dma_data_direction dir,
 			      struct sg_table *sgt);
-uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man);
-uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man);
-int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
+uint64_t amdgpu_vram_mgr_usage(struct amdgpu_device *adev);
+uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_device *adev);
+int amdgpu_vram_mgr_reserve_range(struct amdgpu_device *adev,
 				  uint64_t start, uint64_t size);
-int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
+int amdgpu_vram_mgr_query_page_status(struct amdgpu_device *adev,
 				      uint64_t start);
 
 int amdgpu_ttm_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 88c4177b708a..4806ab817c02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -548,7 +548,6 @@ static void amdgpu_virt_populate_vf2pf_ucode_info(struct amdgpu_device *adev)
 static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
 {
 	struct amd_sriov_msg_vf2pf_info *vf2pf_info;
-	struct ttm_resource_manager *vram_man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
 
 	vf2pf_info = (struct amd_sriov_msg_vf2pf_info *) adev->virt.fw_reserve.p_vf2pf;
 
@@ -571,8 +570,8 @@ static int amdgpu_virt_write_vf2pf_data(struct amdgpu_device *adev)
 	vf2pf_info->driver_cert = 0;
 	vf2pf_info->os_info.all = 0;
 
-	vf2pf_info->fb_usage = amdgpu_vram_mgr_usage(vram_man) >> 20;
-	vf2pf_info->fb_vis_usage = amdgpu_vram_mgr_vis_usage(vram_man) >> 20;
+	vf2pf_info->fb_usage = amdgpu_vram_mgr_usage(adev) >> 20;
+	vf2pf_info->fb_vis_usage = amdgpu_vram_mgr_vis_usage(adev) >> 20;
 	vf2pf_info->fb_size = adev->gmc.real_vram_size >> 20;
 	vf2pf_info->fb_vis_size = adev->gmc.visible_vram_size >> 20;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 7b2b0980ec41..7348b744c929 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -96,10 +96,8 @@ static ssize_t amdgpu_mem_info_vram_used_show(struct device *dev,
 {
 	struct drm_device *ddev = dev_get_drvdata(dev);
 	struct amdgpu_device *adev = drm_to_adev(ddev);
-	struct ttm_resource_manager *man;
 
-	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-	return sysfs_emit(buf, "%llu\n", amdgpu_vram_mgr_usage(man));
+	return sysfs_emit(buf, "%llu\n", amdgpu_vram_mgr_usage(adev));
 }
 
 /**
@@ -116,10 +114,8 @@ static ssize_t amdgpu_mem_info_vis_vram_used_show(struct device *dev,
 {
 	struct drm_device *ddev = dev_get_drvdata(dev);
 	struct amdgpu_device *adev = drm_to_adev(ddev);
-	struct ttm_resource_manager *man;
 
-	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-	return sysfs_emit(buf, "%llu\n", amdgpu_vram_mgr_vis_usage(man));
+	return sysfs_emit(buf, "%llu\n", amdgpu_vram_mgr_vis_usage(adev));
 }
 
 /**
@@ -263,18 +259,22 @@ static void amdgpu_vram_mgr_do_reserve(struct ttm_resource_manager *man)
 /**
  * amdgpu_vram_mgr_reserve_range - Reserve a range from VRAM
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  * @start: start address of the range in VRAM
  * @size: size of the range
  *
  * Reserve memory from start addess with the specified size in VRAM
  */
-int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
+int amdgpu_vram_mgr_reserve_range(struct amdgpu_device *adev,
 				  uint64_t start, uint64_t size)
 {
-	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct ttm_resource_manager *man;
+	struct amdgpu_vram_mgr *mgr;
 	struct amdgpu_vram_reservation *rsv;
 
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
+	mgr = to_vram_mgr(man);
+
 	rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
 	if (!rsv)
 		return -ENOMEM;
@@ -294,7 +294,7 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
 /**
  * amdgpu_vram_mgr_query_page_status - query the reservation status
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  * @start: start address of a page in VRAM
  *
  * Returns:
@@ -302,13 +302,17 @@ int amdgpu_vram_mgr_reserve_range(struct ttm_resource_manager *man,
  *	0: the page has been reserved
  *	-ENOENT: the input page is not a reservation
  */
-int amdgpu_vram_mgr_query_page_status(struct ttm_resource_manager *man,
+int amdgpu_vram_mgr_query_page_status(struct amdgpu_device *adev,
 				      uint64_t start)
 {
-	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct ttm_resource_manager *man;
+	struct amdgpu_vram_mgr *mgr;
 	struct amdgpu_vram_reservation *rsv;
 	int ret;
 
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
+	mgr = to_vram_mgr(man);
+
 	spin_lock(&mgr->lock);
 
 	list_for_each_entry(rsv, &mgr->reservations_pending, node) {
@@ -632,13 +636,18 @@ void amdgpu_vram_mgr_free_sgt(struct device *dev,
 /**
  * amdgpu_vram_mgr_usage - how many bytes are used in this domain
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Returns how many bytes are used in this domain.
  */
-uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_vram_mgr_usage(struct amdgpu_device *adev)
 {
-	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct ttm_resource_manager *man;
+	struct amdgpu_vram_mgr *mgr;
+
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
+	mgr = to_vram_mgr(man);
+
 
 	return atomic64_read(&mgr->usage);
 }
@@ -646,13 +655,17 @@ uint64_t amdgpu_vram_mgr_usage(struct ttm_resource_manager *man)
 /**
  * amdgpu_vram_mgr_vis_usage - how many bytes are used in the visible part
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Returns how many bytes are used in the visible part of VRAM
  */
-uint64_t amdgpu_vram_mgr_vis_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_vram_mgr_vis_usage(struct amdgpu_device *adev)
 {
-	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct ttm_resource_manager *man;
+	struct amdgpu_vram_mgr *mgr;
+
+	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
+	mgr = to_vram_mgr(man);
 
 	return atomic64_read(&mgr->vis_usage);
 }
@@ -669,14 +682,15 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
 				  struct drm_printer *printer)
 {
 	struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
+	struct amdgpu_device *adev = to_amdgpu_device(mgr);
 
 	spin_lock(&mgr->lock);
 	drm_mm_print(&mgr->mm, printer);
 	spin_unlock(&mgr->lock);
 
 	drm_printf(printer, "man size:%llu pages, ram usage:%lluMB, vis usage:%lluMB\n",
-		   man->size, amdgpu_vram_mgr_usage(man) >> 20,
-		   amdgpu_vram_mgr_vis_usage(man) >> 20);
+		   man->size, amdgpu_vram_mgr_usage(adev) >> 20,
+		   amdgpu_vram_mgr_vis_usage(adev) >> 20);
 }
 
 static const struct ttm_resource_manager_func amdgpu_vram_mgr_func = {
-- 
2.32.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-19 18:14 [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Nirmoy Das
  2021-10-19 18:14 ` [PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr Nirmoy Das
@ 2021-10-19 18:14 ` Nirmoy Das
  2021-10-20  6:52   ` Christian König
  2021-10-20  9:11   ` Lazar, Lijo
  2021-10-20  6:49 ` [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Christian König
  2 siblings, 2 replies; 18+ messages in thread
From: Nirmoy Das @ 2021-10-19 18:14 UTC (permalink / raw)
  To: amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky, Nirmoy Das

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
 6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 	amdgpu_fence_driver_hw_fini(adev);

 	amdgpu_device_ip_suspend_phase2(adev);
-	/* This second call to evict device resources is to evict
-	 * the gart page table using the CPU.
-	 */
-	amdgpu_device_evict_resources(adev);

 	return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
  *
  * @adev: amdgpu_device pointer
  *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
  * (pcie r4xx, r5xx+).  These asics require the
  * gart table to be in video memory.
  * Returns 0 for success, error for failure.
  */
 int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
 {
+	struct amdgpu_bo_param bp;
 	int r;

-	if (adev->gart.bo == NULL) {
-		struct amdgpu_bo_param bp;
-
-		memset(&bp, 0, sizeof(bp));
-		bp.size = adev->gart.table_size;
-		bp.byte_align = PAGE_SIZE;
-		bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-		bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-			AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-		bp.type = ttm_bo_type_kernel;
-		bp.resv = NULL;
-		bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-		r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-		if (r) {
-			return r;
-		}
-	}
+	if (adev->gart.bo != NULL)
+		return 0;
+
+	memset(&bp, 0, sizeof(bp));
+	bp.size = adev->gart.table_size;
+	bp.byte_align = PAGE_SIZE;
+	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+	bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+		AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+	bp.type = ttm_bo_type_kernel;
+	bp.resv = NULL;
+	bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+	r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
+	if (r)
+		return r;
+
+	r = amdgpu_gart_table_vram_pin(adev);
+	if (r)
+		return r;
+
 	return 0;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device *adev)
 		return -EINVAL;
 	}

-	r = amdgpu_gart_table_vram_pin(adev);
-	if (r)
-		return r;
+	if (adev->in_suspend) {
+		r = amdgpu_gtt_mgr_recover(adev);
+		if (r)
+			return r;
+	}

 	r = adev->gfxhub.funcs->gart_enable(adev);
 	if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device *adev)
 {
 	adev->gfxhub.funcs->gart_disable(adev);
 	adev->mmhub.funcs->gart_disable(adev);
-	amdgpu_gart_table_vram_unpin(adev);
 }

 static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 0a50fdaced7e..02e90d9443c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
 		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
 		return -EINVAL;
 	}
-	r = amdgpu_gart_table_vram_pin(adev);
-	if (r)
-		return r;
+
+	if (adev->in_suspend) {
+		r = amdgpu_gtt_mgr_recover(adev);
+		if (r)
+			return r;
+	}

 	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);

@@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct amdgpu_device *adev)
 	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
 	WREG32(mmVM_L2_CNTL, tmp);
 	WREG32(mmVM_L2_CNTL2, 0);
-	amdgpu_gart_table_vram_unpin(adev);
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 492ebed2915b..dc2577e37688 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
 		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
 		return -EINVAL;
 	}
-	r = amdgpu_gart_table_vram_pin(adev);
-	if (r)
-		return r;
+
+	if (adev->in_suspend) {
+		r = amdgpu_gtt_mgr_recover(adev);
+		if (r)
+			return r;
+	}

 	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);

@@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct amdgpu_device *adev)
 	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
 	WREG32(mmVM_L2_CNTL, tmp);
 	WREG32(mmVM_L2_CNTL2, 0);
-	amdgpu_gart_table_vram_unpin(adev);
 }

 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index cb82404df534..732d91dbf449 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
 		return -EINVAL;
 	}

-	r = amdgpu_gart_table_vram_pin(adev);
-	if (r)
-		return r;
+	if (adev->in_suspend) {
+		r = amdgpu_gtt_mgr_recover(adev);
+		if (r)
+			return r;
+	}

 	r = adev->gfxhub.funcs->gart_enable(adev);
 	if (r)
@@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device *adev)
 {
 	adev->gfxhub.funcs->gart_disable(adev);
 	adev->mmhub.funcs->gart_disable(adev);
-	amdgpu_gart_table_vram_unpin(adev);
 }

 static int gmc_v9_0_hw_fini(void *handle)
--
2.32.0


^ permalink raw reply related	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
  2021-10-19 18:14 [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Nirmoy Das
  2021-10-19 18:14 ` [PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr Nirmoy Das
  2021-10-19 18:14 ` [PATCH v2 3/3] drm/amdgpu: recover gart table at resume Nirmoy Das
@ 2021-10-20  6:49 ` Christian König
  2021-10-20  8:48   ` Das, Nirmoy
  2 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2021-10-20  6:49 UTC (permalink / raw)
  To: Nirmoy Das, amd-gfx; +Cc: andrey.grodzovsky

Am 19.10.21 um 20:14 schrieb Nirmoy Das:
> Do not allow exported amdgpu_gtt_mgr_*() to accept
> any ttm_resource_manager pointer. Also there is no need
> to force other module to call a ttm function just to
> eventually call gtt_mgr functions.

That's a rather bad idea I think.

The GTT and VRAM manager work on their respective objects and not on the 
adev directly.

Christian.

>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 ++++++++++++---------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
>   4 files changed, 24 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 41ce86244144..5807df52031c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
>   
>   	amdgpu_virt_init_data_exchange(adev);
>   	/* we need recover gart prior to run SMC/CP/SDMA resume */
> -	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
> +	amdgpu_gtt_mgr_recover(adev);
>   
>   	r = amdgpu_device_fw_loading(adev);
>   	if (r)
> @@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
>   					amdgpu_inc_vram_lost(tmp_adev);
>   				}
>   
> -				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
> +				r = amdgpu_gtt_mgr_recover(tmp_adev);
>   				if (r)
>   					goto out;
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> index c18f16b3be9c..5e41f8ef743a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
> @@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device *dev,
>   {
>   	struct drm_device *ddev = dev_get_drvdata(dev);
>   	struct amdgpu_device *adev = drm_to_adev(ddev);
> -	struct ttm_resource_manager *man;
>   
> -	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
> -	return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
> +	return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
>   }
>   
>   static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
> @@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct ttm_resource_manager *man,
>   /**
>    * amdgpu_gtt_mgr_usage - return usage of GTT domain
>    *
> - * @man: TTM memory type manager
> + * @adev: amdgpu_device pointer
>    *
>    * Return how many bytes are used in the GTT domain
>    */
> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
>   {
> -	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
> -	s64 result = man->size - atomic64_read(&mgr->available);
> +	struct ttm_resource_manager *man;
> +	struct amdgpu_gtt_mgr *mgr;
> +	s64 result;
> +
> +	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
> +	mgr = to_gtt_mgr(man);
> +	result = man->size - atomic64_read(&mgr->available);
>   
>   	return (result > 0 ? result : 0) * PAGE_SIZE;
>   }
> @@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>   /**
>    * amdgpu_gtt_mgr_recover - re-init gart
>    *
> - * @man: TTM memory type manager
> + * @adev: amdgpu_device pointer
>    *
>    * Re-init the gart for each known BO in the GTT.
>    */
> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
>   {
> -	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
> -	struct amdgpu_device *adev;
> +	struct ttm_resource_manager *man;
> +	struct amdgpu_gtt_mgr *mgr;
>   	struct amdgpu_gtt_node *node;
>   	struct drm_mm_node *mm_node;
>   	int r = 0;
>   
> -	adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
> +	man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
> +	mgr = to_gtt_mgr(man);
>   	spin_lock(&mgr->lock);
>   	drm_mm_for_each_node(mm_node, &mgr->mm) {
>   		node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
> @@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct ttm_resource_manager *man,
>   				 struct drm_printer *printer)
>   {
>   	struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
> +	struct amdgpu_device *adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
>   
>   	spin_lock(&mgr->lock);
>   	drm_mm_print(&mgr->mm, printer);
> @@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct ttm_resource_manager *man,
>   
>   	drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, usage:%lluMB\n",
>   		   man->size, (u64)atomic64_read(&mgr->available),
> -		   amdgpu_gtt_mgr_usage(man) >> 20);
> +		   amdgpu_gtt_mgr_usage(adev) >> 20);
>   }
>   
>   static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> index d2955ea4a62b..b9b38f70e416 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
> @@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   		ui64 = amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM));
>   		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>   	case AMDGPU_INFO_GTT_USAGE:
> -		ui64 = amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
> +		ui64 = amdgpu_gtt_mgr_usage(adev);
>   		return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>   	case AMDGPU_INFO_GDS_CONFIG: {
>   		struct drm_amdgpu_info_gds gds_info;
> @@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   		mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
>   			atomic64_read(&adev->gart_pin_size);
>   		mem.gtt.heap_usage =
> -			amdgpu_gtt_mgr_usage(gtt_man);
> +			amdgpu_gtt_mgr_usage(adev);
>   		mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
>   
>   		return copy_to_user(out, &mem,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index 91a087f9dc7c..6e337cacef51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device *adev);
>   void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>   
>   bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
>   
>   uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man);
>   


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-19 18:14 ` [PATCH v2 3/3] drm/amdgpu: recover gart table at resume Nirmoy Das
@ 2021-10-20  6:52   ` Christian König
  2021-10-20  8:41     ` Das, Nirmoy
  2021-10-20  9:11   ` Lazar, Lijo
  1 sibling, 1 reply; 18+ messages in thread
From: Christian König @ 2021-10-20  6:52 UTC (permalink / raw)
  To: Nirmoy Das, amd-gfx; +Cc: andrey.grodzovsky

Am 19.10.21 um 20:14 schrieb Nirmoy Das:
> Get rid off pin/unpin of gart BO at resume/suspend and
> instead pin only once and try to recover gart content
> at resume time. This is much more stable in case there
> is OOM situation at 2nd call to amdgpu_device_evict_resources()
> while evicting GART table.
>
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>   6 files changed, 45 insertions(+), 39 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5807df52031c..f69e613805db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>   	amdgpu_fence_driver_hw_fini(adev);
>
>   	amdgpu_device_ip_suspend_phase2(adev);
> -	/* This second call to evict device resources is to evict
> -	 * the gart page table using the CPU.
> -	 */
> -	amdgpu_device_evict_resources(adev);
>
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index d3e4203f6217..97a9f61fa106 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>    *
>    * @adev: amdgpu_device pointer
>    *
> - * Allocate video memory for GART page table
> + * Allocate and pin video memory for GART page table
>    * (pcie r4xx, r5xx+).  These asics require the
>    * gart table to be in video memory.
>    * Returns 0 for success, error for failure.
>    */
>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>   {
> +	struct amdgpu_bo_param bp;
>   	int r;
>
> -	if (adev->gart.bo == NULL) {
> -		struct amdgpu_bo_param bp;
> -
> -		memset(&bp, 0, sizeof(bp));
> -		bp.size = adev->gart.table_size;
> -		bp.byte_align = PAGE_SIZE;
> -		bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
> -		bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
> -			AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
> -		bp.type = ttm_bo_type_kernel;
> -		bp.resv = NULL;
> -		bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> -
> -		r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
> -		if (r) {
> -			return r;
> -		}
> -	}
> +	if (adev->gart.bo != NULL)
> +		return 0;
> +
> +	memset(&bp, 0, sizeof(bp));
> +	bp.size = adev->gart.table_size;
> +	bp.byte_align = PAGE_SIZE;
> +	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
> +	bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
> +		AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
> +	bp.type = ttm_bo_type_kernel;
> +	bp.resv = NULL;
> +	bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> +
> +	r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
> +	if (r)
> +		return r;
> +
> +	r = amdgpu_gart_table_vram_pin(adev);
> +	if (r)
> +		return r;
> +

Instead of all this you should be able to use amdgpu_bo_create_kernel().

>   	return 0;
>   }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3ec5ff5a6dbe..75d584e1b0e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device *adev)
>   		return -EINVAL;
>   	}
>
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}

Please drop the in_suspend check here.

If I'm not completely mistaken the GTT domain should already be 
initialized here and if it's not then we can easily check for that in 
amdgpu_gtt_mgr_recover.

Christian.

>
>   	r = adev->gfxhub.funcs->gart_enable(adev);
>   	if (r)
> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device *adev)
>   {
>   	adev->gfxhub.funcs->gart_disable(adev);
>   	adev->mmhub.funcs->gart_disable(adev);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
>
>   static int gmc_v10_0_hw_fini(void *handle)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> index 0a50fdaced7e..02e90d9443c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
>   		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>   		return -EINVAL;
>   	}
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
>
>   	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>
> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct amdgpu_device *adev)
>   	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>   	WREG32(mmVM_L2_CNTL, tmp);
>   	WREG32(mmVM_L2_CNTL2, 0);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
>
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 492ebed2915b..dc2577e37688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
>   		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>   		return -EINVAL;
>   	}
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
>
>   	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>
> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct amdgpu_device *adev)
>   	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>   	WREG32(mmVM_L2_CNTL, tmp);
>   	WREG32(mmVM_L2_CNTL2, 0);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
>
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index cb82404df534..732d91dbf449 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
>   		return -EINVAL;
>   	}
>
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
>
>   	r = adev->gfxhub.funcs->gart_enable(adev);
>   	if (r)
> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device *adev)
>   {
>   	adev->gfxhub.funcs->gart_disable(adev);
>   	adev->mmhub.funcs->gart_disable(adev);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
>
>   static int gmc_v9_0_hw_fini(void *handle)
> --
> 2.32.0
>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20  6:52   ` Christian König
@ 2021-10-20  8:41     ` Das, Nirmoy
  0 siblings, 0 replies; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20  8:41 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: andrey.grodzovsky


On 10/20/2021 8:52 AM, Christian König wrote:
> Am 19.10.21 um 20:14 schrieb Nirmoy Das:
>> Get rid off pin/unpin of gart BO at resume/suspend and
>> instead pin only once and try to recover gart content
>> at resume time. This is much more stable in case there
>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>> while evicting GART table.
>>
>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5807df52031c..f69e613805db 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
>> *dev, bool fbcon)
>>       amdgpu_fence_driver_hw_fini(adev);
>>
>>       amdgpu_device_ip_suspend_phase2(adev);
>> -    /* This second call to evict device resources is to evict
>> -     * the gart page table using the CPU.
>> -     */
>> -    amdgpu_device_evict_resources(adev);
>>
>>       return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index d3e4203f6217..97a9f61fa106 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>> amdgpu_device *adev)
>>    *
>>    * @adev: amdgpu_device pointer
>>    *
>> - * Allocate video memory for GART page table
>> + * Allocate and pin video memory for GART page table
>>    * (pcie r4xx, r5xx+).  These asics require the
>>    * gart table to be in video memory.
>>    * Returns 0 for success, error for failure.
>>    */
>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>   {
>> +    struct amdgpu_bo_param bp;
>>       int r;
>>
>> -    if (adev->gart.bo == NULL) {
>> -        struct amdgpu_bo_param bp;
>> -
>> -        memset(&bp, 0, sizeof(bp));
>> -        bp.size = adev->gart.table_size;
>> -        bp.byte_align = PAGE_SIZE;
>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> -        bp.type = ttm_bo_type_kernel;
>> -        bp.resv = NULL;
>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>> -
>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>> -        if (r) {
>> -            return r;
>> -        }
>> -    }
>> +    if (adev->gart.bo != NULL)
>> +        return 0;
>> +
>> +    memset(&bp, 0, sizeof(bp));
>> +    bp.size = adev->gart.table_size;
>> +    bp.byte_align = PAGE_SIZE;
>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +    bp.type = ttm_bo_type_kernel;
>> +    bp.resv = NULL;
>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>> +
>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>> +    if (r)
>> +        return r;
>> +
>> +    r = amdgpu_gart_table_vram_pin(adev);
>> +    if (r)
>> +        return r;
>> +
>
> Instead of all this you should be able to use amdgpu_bo_create_kernel().


OK, with that we can remove amdgpu_gart_table_vram_pin() completely.


>
>>       return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           return -EINVAL;
>>       }
>>
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>
> Please drop the in_suspend check here.
>
> If I'm not completely mistaken the GTT domain should already be 
> initialized here and if it's not then we can easily check for that in 
> amdgpu_gtt_mgr_recover.


Yes it is. I will remove that.


Thanks,

Nirmoy


>
> Christian.
>
>>
>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>       if (r)
>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>> amdgpu_device *adev)
>>   {
>>       adev->gfxhub.funcs->gart_disable(adev);
>>       adev->mmhub.funcs->gart_disable(adev);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   static int gmc_v10_0_hw_fini(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> index 0a50fdaced7e..02e90d9443c1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>           return -EINVAL;
>>       }
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>
>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>> amdgpu_device *adev)
>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>       WREG32(mmVM_L2_CNTL, tmp);
>>       WREG32(mmVM_L2_CNTL2, 0);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 492ebed2915b..dc2577e37688 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>           return -EINVAL;
>>       }
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>
>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>> amdgpu_device *adev)
>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>       WREG32(mmVM_L2_CNTL, tmp);
>>       WREG32(mmVM_L2_CNTL2, 0);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index cb82404df534..732d91dbf449 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           return -EINVAL;
>>       }
>>
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>       if (r)
>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>> amdgpu_device *adev)
>>   {
>>       adev->gfxhub.funcs->gart_disable(adev);
>>       adev->mmhub.funcs->gart_disable(adev);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   static int gmc_v9_0_hw_fini(void *handle)
>> -- 
>> 2.32.0
>>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
  2021-10-20  6:49 ` [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Christian König
@ 2021-10-20  8:48   ` Das, Nirmoy
  2021-10-20  9:19     ` Lazar, Lijo
  0 siblings, 1 reply; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20  8:48 UTC (permalink / raw)
  To: Christian König, amd-gfx; +Cc: andrey.grodzovsky


On 10/20/2021 8:49 AM, Christian König wrote:
> Am 19.10.21 um 20:14 schrieb Nirmoy Das:
>> Do not allow exported amdgpu_gtt_mgr_*() to accept
>> any ttm_resource_manager pointer. Also there is no need
>> to force other module to call a ttm function just to
>> eventually call gtt_mgr functions.
>
> That's a rather bad idea I think.
>
> The GTT and VRAM manager work on their respective objects and not on 
> the adev directly.


What is bothering me is : it is obvious that  the amdgpu_gtt_mgr_usage() 
for example should only calculate

usages for TTM_PL_TT type resource manager, why to pass that explicitly. 
I am trying to leverage the fact that

we only have one gtt/vram manager for a adev and the functions that I 
changed  work on whole gtt/vram manager

as a unit.


Regards,

Nirmoy


>
> Christian.
>
>>
>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 ++++++++++++---------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
>>   4 files changed, 24 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 41ce86244144..5807df52031c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
>> amdgpu_device *adev,
>>         amdgpu_virt_init_data_exchange(adev);
>>       /* we need recover gart prior to run SMC/CP/SDMA resume */
>> - amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>> +    amdgpu_gtt_mgr_recover(adev);
>>         r = amdgpu_device_fw_loading(adev);
>>       if (r)
>> @@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
>> *device_list_handle,
>>                       amdgpu_inc_vram_lost(tmp_adev);
>>                   }
>>   -                r = 
>> amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
>> TTM_PL_TT));
>> +                r = amdgpu_gtt_mgr_recover(tmp_adev);
>>                   if (r)
>>                       goto out;
>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> index c18f16b3be9c..5e41f8ef743a 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>> @@ -77,10 +77,8 @@ static ssize_t 
>> amdgpu_mem_info_gtt_used_show(struct device *dev,
>>   {
>>       struct drm_device *ddev = dev_get_drvdata(dev);
>>       struct amdgpu_device *adev = drm_to_adev(ddev);
>> -    struct ttm_resource_manager *man;
>>   -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>> -    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
>> +    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
>>   }
>>     static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
>> @@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
>> ttm_resource_manager *man,
>>   /**
>>    * amdgpu_gtt_mgr_usage - return usage of GTT domain
>>    *
>> - * @man: TTM memory type manager
>> + * @adev: amdgpu_device pointer
>>    *
>>    * Return how many bytes are used in the GTT domain
>>    */
>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
>>   {
>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>> -    s64 result = man->size - atomic64_read(&mgr->available);
>> +    struct ttm_resource_manager *man;
>> +    struct amdgpu_gtt_mgr *mgr;
>> +    s64 result;
>> +
>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>> +    mgr = to_gtt_mgr(man);
>> +    result = man->size - atomic64_read(&mgr->available);
>>         return (result > 0 ? result : 0) * PAGE_SIZE;
>>   }
>> @@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
>> ttm_resource_manager *man)
>>   /**
>>    * amdgpu_gtt_mgr_recover - re-init gart
>>    *
>> - * @man: TTM memory type manager
>> + * @adev: amdgpu_device pointer
>>    *
>>    * Re-init the gart for each known BO in the GTT.
>>    */
>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
>>   {
>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>> -    struct amdgpu_device *adev;
>> +    struct ttm_resource_manager *man;
>> +    struct amdgpu_gtt_mgr *mgr;
>>       struct amdgpu_gtt_node *node;
>>       struct drm_mm_node *mm_node;
>>       int r = 0;
>>   -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>> +    mgr = to_gtt_mgr(man);
>>       spin_lock(&mgr->lock);
>>       drm_mm_for_each_node(mm_node, &mgr->mm) {
>>           node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
>> @@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>> ttm_resource_manager *man,
>>                    struct drm_printer *printer)
>>   {
>>       struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>> +    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
>> mman.gtt_mgr);
>>         spin_lock(&mgr->lock);
>>       drm_mm_print(&mgr->mm, printer);
>> @@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>> ttm_resource_manager *man,
>>         drm_printf(printer, "man size:%llu pages, gtt available:%lld 
>> pages, usage:%lluMB\n",
>>              man->size, (u64)atomic64_read(&mgr->available),
>> -           amdgpu_gtt_mgr_usage(man) >> 20);
>> +           amdgpu_gtt_mgr_usage(adev) >> 20);
>>   }
>>     static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func 
>> = {
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> index d2955ea4a62b..b9b38f70e416 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>> @@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file *filp)
>>           ui64 = 
>> amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, 
>> TTM_PL_VRAM));
>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>       case AMDGPU_INFO_GTT_USAGE:
>> -        ui64 = 
>> amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>> +        ui64 = amdgpu_gtt_mgr_usage(adev);
>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>       case AMDGPU_INFO_GDS_CONFIG: {
>>           struct drm_amdgpu_info_gds gds_info;
>> @@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>> void *data, struct drm_file *filp)
>>           mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
>>               atomic64_read(&adev->gart_pin_size);
>>           mem.gtt.heap_usage =
>> -            amdgpu_gtt_mgr_usage(gtt_man);
>> +            amdgpu_gtt_mgr_usage(adev);
>>           mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
>>             return copy_to_user(out, &mem,
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index 91a087f9dc7c..6e337cacef51 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device 
>> *adev);
>>   void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>>     bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
>>     uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man);
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-19 18:14 ` [PATCH v2 3/3] drm/amdgpu: recover gart table at resume Nirmoy Das
  2021-10-20  6:52   ` Christian König
@ 2021-10-20  9:11   ` Lazar, Lijo
  2021-10-20  9:53     ` Das, Nirmoy
  1 sibling, 1 reply; 18+ messages in thread
From: Lazar, Lijo @ 2021-10-20  9:11 UTC (permalink / raw)
  To: Nirmoy Das, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky



On 10/19/2021 11:44 PM, Nirmoy Das wrote:
> Get rid off pin/unpin of gart BO at resume/suspend and
> instead pin only once and try to recover gart content
> at resume time. This is much more stable in case there
> is OOM situation at 2nd call to amdgpu_device_evict_resources()
> while evicting GART table.
> 
> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>   6 files changed, 45 insertions(+), 39 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 5807df52031c..f69e613805db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
>   	amdgpu_fence_driver_hw_fini(adev);
> 
>   	amdgpu_device_ip_suspend_phase2(adev);
> -	/* This second call to evict device resources is to evict
> -	 * the gart page table using the CPU.
> -	 */
> -	amdgpu_device_evict_resources(adev);
> 
>   	return 0;
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> index d3e4203f6217..97a9f61fa106 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device *adev)
>    *
>    * @adev: amdgpu_device pointer
>    *
> - * Allocate video memory for GART page table
> + * Allocate and pin video memory for GART page table
>    * (pcie r4xx, r5xx+).  These asics require the
>    * gart table to be in video memory.
>    * Returns 0 for success, error for failure.
>    */
>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>   {
> +	struct amdgpu_bo_param bp;
>   	int r;
> 
> -	if (adev->gart.bo == NULL) {
> -		struct amdgpu_bo_param bp;
> -
> -		memset(&bp, 0, sizeof(bp));
> -		bp.size = adev->gart.table_size;
> -		bp.byte_align = PAGE_SIZE;
> -		bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
> -		bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
> -			AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
> -		bp.type = ttm_bo_type_kernel;
> -		bp.resv = NULL;
> -		bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> -
> -		r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
> -		if (r) {
> -			return r;
> -		}
> -	}
> +	if (adev->gart.bo != NULL)
> +		return 0;
> +
> +	memset(&bp, 0, sizeof(bp));
> +	bp.size = adev->gart.table_size;
> +	bp.byte_align = PAGE_SIZE;
> +	bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
> +	bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
> +		AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
> +	bp.type = ttm_bo_type_kernel;
> +	bp.resv = NULL;
> +	bp.bo_ptr_size = sizeof(struct amdgpu_bo);
> +
> +	r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
> +	if (r)
> +		return r;
> +
> +	r = amdgpu_gart_table_vram_pin(adev);
> +	if (r)
> +		return r;
> +
>   	return 0;
>   }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 3ec5ff5a6dbe..75d584e1b0e9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device *adev)
>   		return -EINVAL;
>   	}
> 
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);

When the existing usage of this function is checked, this is called 
during reset recovery after ip resume phase1. Can't the same thing be 
done in ip_resume() to place this after phase1 resume instead of 
repeating in every IP version?

Thanks,
Lijo

> +		if (r)
> +			return r;
> +	}
> 
>   	r = adev->gfxhub.funcs->gart_enable(adev);
>   	if (r)
> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device *adev)
>   {
>   	adev->gfxhub.funcs->gart_disable(adev);
>   	adev->mmhub.funcs->gart_disable(adev);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
> 
>   static int gmc_v10_0_hw_fini(void *handle)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> index 0a50fdaced7e..02e90d9443c1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
>   		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>   		return -EINVAL;
>   	}
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
> 
>   	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
> 
> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct amdgpu_device *adev)
>   	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>   	WREG32(mmVM_L2_CNTL, tmp);
>   	WREG32(mmVM_L2_CNTL2, 0);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
> 
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> index 492ebed2915b..dc2577e37688 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
>   		dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>   		return -EINVAL;
>   	}
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
> 
>   	table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
> 
> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct amdgpu_device *adev)
>   	tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>   	WREG32(mmVM_L2_CNTL, tmp);
>   	WREG32(mmVM_L2_CNTL2, 0);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
> 
>   /**
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index cb82404df534..732d91dbf449 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device *adev)
>   		return -EINVAL;
>   	}
> 
> -	r = amdgpu_gart_table_vram_pin(adev);
> -	if (r)
> -		return r;
> +	if (adev->in_suspend) {
> +		r = amdgpu_gtt_mgr_recover(adev);
> +		if (r)
> +			return r;
> +	}
> 
>   	r = adev->gfxhub.funcs->gart_enable(adev);
>   	if (r)
> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device *adev)
>   {
>   	adev->gfxhub.funcs->gart_disable(adev);
>   	adev->mmhub.funcs->gart_disable(adev);
> -	amdgpu_gart_table_vram_unpin(adev);
>   }
> 
>   static int gmc_v9_0_hw_fini(void *handle)
> --
> 2.32.0
> 

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
  2021-10-20  8:48   ` Das, Nirmoy
@ 2021-10-20  9:19     ` Lazar, Lijo
  2021-10-20 10:49       ` Christian König
  0 siblings, 1 reply; 18+ messages in thread
From: Lazar, Lijo @ 2021-10-20  9:19 UTC (permalink / raw)
  To: Das, Nirmoy, Christian König, amd-gfx; +Cc: andrey.grodzovsky



On 10/20/2021 2:18 PM, Das, Nirmoy wrote:
> 
> On 10/20/2021 8:49 AM, Christian König wrote:
>> Am 19.10.21 um 20:14 schrieb Nirmoy Das:
>>> Do not allow exported amdgpu_gtt_mgr_*() to accept
>>> any ttm_resource_manager pointer. Also there is no need
>>> to force other module to call a ttm function just to
>>> eventually call gtt_mgr functions.
>>
>> That's a rather bad idea I think.
>>
>> The GTT and VRAM manager work on their respective objects and not on 
>> the adev directly.
> 
> 
> What is bothering me is : it is obvious that  the amdgpu_gtt_mgr_usage() 
> for example should only calculate
> 
> usages for TTM_PL_TT type resource manager, why to pass that explicitly. 
> I am trying to leverage the fact that
> 
> we only have one gtt/vram manager for a adev and the functions that I 
> changed  work on whole gtt/vram manager
> 
> as a unit.
> 

Don't know about the functional aspects. From a sofware perspective, 
amdgpu_gtt_mgr_*() operating on struct amdgpu_gtt_mgr *mgr seems more 
logical.

Thanks,
Lijo

> 
> Regards,
> 
> Nirmoy
> 
> 
>>
>> Christian.
>>
>>>
>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 ++++++++++++---------
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
>>>   4 files changed, 24 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 41ce86244144..5807df52031c 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
>>> amdgpu_device *adev,
>>>         amdgpu_virt_init_data_exchange(adev);
>>>       /* we need recover gart prior to run SMC/CP/SDMA resume */
>>> - amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>>> +    amdgpu_gtt_mgr_recover(adev);
>>>         r = amdgpu_device_fw_loading(adev);
>>>       if (r)
>>> @@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
>>> *device_list_handle,
>>>                       amdgpu_inc_vram_lost(tmp_adev);
>>>                   }
>>>   -                r = 
>>> amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
>>> TTM_PL_TT));
>>> +                r = amdgpu_gtt_mgr_recover(tmp_adev);
>>>                   if (r)
>>>                       goto out;
>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>> index c18f16b3be9c..5e41f8ef743a 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>> @@ -77,10 +77,8 @@ static ssize_t 
>>> amdgpu_mem_info_gtt_used_show(struct device *dev,
>>>   {
>>>       struct drm_device *ddev = dev_get_drvdata(dev);
>>>       struct amdgpu_device *adev = drm_to_adev(ddev);
>>> -    struct ttm_resource_manager *man;
>>>   -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> -    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
>>> +    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
>>>   }
>>>     static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
>>> @@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
>>> ttm_resource_manager *man,
>>>   /**
>>>    * amdgpu_gtt_mgr_usage - return usage of GTT domain
>>>    *
>>> - * @man: TTM memory type manager
>>> + * @adev: amdgpu_device pointer
>>>    *
>>>    * Return how many bytes are used in the GTT domain
>>>    */
>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
>>>   {
>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>> -    s64 result = man->size - atomic64_read(&mgr->available);
>>> +    struct ttm_resource_manager *man;
>>> +    struct amdgpu_gtt_mgr *mgr;
>>> +    s64 result;
>>> +
>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> +    mgr = to_gtt_mgr(man);
>>> +    result = man->size - atomic64_read(&mgr->available);
>>>         return (result > 0 ? result : 0) * PAGE_SIZE;
>>>   }
>>> @@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
>>> ttm_resource_manager *man)
>>>   /**
>>>    * amdgpu_gtt_mgr_recover - re-init gart
>>>    *
>>> - * @man: TTM memory type manager
>>> + * @adev: amdgpu_device pointer
>>>    *
>>>    * Re-init the gart for each known BO in the GTT.
>>>    */
>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
>>>   {
>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>> -    struct amdgpu_device *adev;
>>> +    struct ttm_resource_manager *man;
>>> +    struct amdgpu_gtt_mgr *mgr;
>>>       struct amdgpu_gtt_node *node;
>>>       struct drm_mm_node *mm_node;
>>>       int r = 0;
>>>   -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>> +    mgr = to_gtt_mgr(man);
>>>       spin_lock(&mgr->lock);
>>>       drm_mm_for_each_node(mm_node, &mgr->mm) {
>>>           node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
>>> @@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>> ttm_resource_manager *man,
>>>                    struct drm_printer *printer)
>>>   {
>>>       struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>> +    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
>>> mman.gtt_mgr);
>>>         spin_lock(&mgr->lock);
>>>       drm_mm_print(&mgr->mm, printer);
>>> @@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>> ttm_resource_manager *man,
>>>         drm_printf(printer, "man size:%llu pages, gtt available:%lld 
>>> pages, usage:%lluMB\n",
>>>              man->size, (u64)atomic64_read(&mgr->available),
>>> -           amdgpu_gtt_mgr_usage(man) >> 20);
>>> +           amdgpu_gtt_mgr_usage(adev) >> 20);
>>>   }
>>>     static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func 
>>> = {
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> index d2955ea4a62b..b9b38f70e416 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>> @@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>> void *data, struct drm_file *filp)
>>>           ui64 = 
>>> amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, 
>>> TTM_PL_VRAM));
>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>>       case AMDGPU_INFO_GTT_USAGE:
>>> -        ui64 = 
>>> amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>>> +        ui64 = amdgpu_gtt_mgr_usage(adev);
>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT : 0;
>>>       case AMDGPU_INFO_GDS_CONFIG: {
>>>           struct drm_amdgpu_info_gds gds_info;
>>> @@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>> void *data, struct drm_file *filp)
>>>           mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
>>>               atomic64_read(&adev->gart_pin_size);
>>>           mem.gtt.heap_usage =
>>> -            amdgpu_gtt_mgr_usage(gtt_man);
>>> +            amdgpu_gtt_mgr_usage(adev);
>>>           mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
>>>             return copy_to_user(out, &mem,
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> index 91a087f9dc7c..6e337cacef51 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>> @@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device 
>>> *adev);
>>>   void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>>>     bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
>>>     uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager *man);
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20  9:11   ` Lazar, Lijo
@ 2021-10-20  9:53     ` Das, Nirmoy
  2021-10-20 10:03       ` Lazar, Lijo
  0 siblings, 1 reply; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20  9:53 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky


On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>
>
> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>> Get rid off pin/unpin of gart BO at resume/suspend and
>> instead pin only once and try to recover gart content
>> at resume time. This is much more stable in case there
>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>> while evicting GART table.
>>
>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 5807df52031c..f69e613805db 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
>> *dev, bool fbcon)
>>       amdgpu_fence_driver_hw_fini(adev);
>>
>>       amdgpu_device_ip_suspend_phase2(adev);
>> -    /* This second call to evict device resources is to evict
>> -     * the gart page table using the CPU.
>> -     */
>> -    amdgpu_device_evict_resources(adev);
>>
>>       return 0;
>>   }
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> index d3e4203f6217..97a9f61fa106 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>> amdgpu_device *adev)
>>    *
>>    * @adev: amdgpu_device pointer
>>    *
>> - * Allocate video memory for GART page table
>> + * Allocate and pin video memory for GART page table
>>    * (pcie r4xx, r5xx+).  These asics require the
>>    * gart table to be in video memory.
>>    * Returns 0 for success, error for failure.
>>    */
>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>   {
>> +    struct amdgpu_bo_param bp;
>>       int r;
>>
>> -    if (adev->gart.bo == NULL) {
>> -        struct amdgpu_bo_param bp;
>> -
>> -        memset(&bp, 0, sizeof(bp));
>> -        bp.size = adev->gart.table_size;
>> -        bp.byte_align = PAGE_SIZE;
>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> -        bp.type = ttm_bo_type_kernel;
>> -        bp.resv = NULL;
>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>> -
>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>> -        if (r) {
>> -            return r;
>> -        }
>> -    }
>> +    if (adev->gart.bo != NULL)
>> +        return 0;
>> +
>> +    memset(&bp, 0, sizeof(bp));
>> +    bp.size = adev->gart.table_size;
>> +    bp.byte_align = PAGE_SIZE;
>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>> +    bp.type = ttm_bo_type_kernel;
>> +    bp.resv = NULL;
>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>> +
>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>> +    if (r)
>> +        return r;
>> +
>> +    r = amdgpu_gart_table_vram_pin(adev);
>> +    if (r)
>> +        return r;
>> +
>>       return 0;
>>   }
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           return -EINVAL;
>>       }
>>
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>
> When the existing usage of this function is checked, this is called 
> during reset recovery after ip resume phase1. Can't the same thing be 
> done in ip_resume() to place this after phase1 resume instead of 
> repeating in every IP version?


Placing amdgpu_gtt_mgr_recover() after phase1 generally works but  
gmc_v10_0_gart_enable() seems to be correct  place  to do this

gart specific work.


Regards,

Nirmoy



>
> Thanks,
> Lijo
>
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>       if (r)
>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>> amdgpu_device *adev)
>>   {
>>       adev->gfxhub.funcs->gart_disable(adev);
>>       adev->mmhub.funcs->gart_disable(adev);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   static int gmc_v10_0_hw_fini(void *handle)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> index 0a50fdaced7e..02e90d9443c1 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>           return -EINVAL;
>>       }
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>
>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>> amdgpu_device *adev)
>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>       WREG32(mmVM_L2_CNTL, tmp);
>>       WREG32(mmVM_L2_CNTL2, 0);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> index 492ebed2915b..dc2577e37688 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>           return -EINVAL;
>>       }
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>
>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>> amdgpu_device *adev)
>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>       WREG32(mmVM_L2_CNTL, tmp);
>>       WREG32(mmVM_L2_CNTL2, 0);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   /**
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> index cb82404df534..732d91dbf449 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>> amdgpu_device *adev)
>>           return -EINVAL;
>>       }
>>
>> -    r = amdgpu_gart_table_vram_pin(adev);
>> -    if (r)
>> -        return r;
>> +    if (adev->in_suspend) {
>> +        r = amdgpu_gtt_mgr_recover(adev);
>> +        if (r)
>> +            return r;
>> +    }
>>
>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>       if (r)
>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>> amdgpu_device *adev)
>>   {
>>       adev->gfxhub.funcs->gart_disable(adev);
>>       adev->mmhub.funcs->gart_disable(adev);
>> -    amdgpu_gart_table_vram_unpin(adev);
>>   }
>>
>>   static int gmc_v9_0_hw_fini(void *handle)
>> -- 
>> 2.32.0
>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20  9:53     ` Das, Nirmoy
@ 2021-10-20 10:03       ` Lazar, Lijo
  2021-10-20 10:12         ` Das, Nirmoy
  0 siblings, 1 reply; 18+ messages in thread
From: Lazar, Lijo @ 2021-10-20 10:03 UTC (permalink / raw)
  To: Das, Nirmoy, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky



On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
> 
> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>
>>
>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>> instead pin only once and try to recover gart content
>>> at resume time. This is much more stable in case there
>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>> while evicting GART table.
>>>
>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>> ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 ++++++++++++----------
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> index 5807df52031c..f69e613805db 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
>>> *dev, bool fbcon)
>>>       amdgpu_fence_driver_hw_fini(adev);
>>>
>>>       amdgpu_device_ip_suspend_phase2(adev);
>>> -    /* This second call to evict device resources is to evict
>>> -     * the gart page table using the CPU.
>>> -     */
>>> -    amdgpu_device_evict_resources(adev);
>>>
>>>       return 0;
>>>   }
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> index d3e4203f6217..97a9f61fa106 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>> amdgpu_device *adev)
>>>    *
>>>    * @adev: amdgpu_device pointer
>>>    *
>>> - * Allocate video memory for GART page table
>>> + * Allocate and pin video memory for GART page table
>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>    * gart table to be in video memory.
>>>    * Returns 0 for success, error for failure.
>>>    */
>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>   {
>>> +    struct amdgpu_bo_param bp;
>>>       int r;
>>>
>>> -    if (adev->gart.bo == NULL) {
>>> -        struct amdgpu_bo_param bp;
>>> -
>>> -        memset(&bp, 0, sizeof(bp));
>>> -        bp.size = adev->gart.table_size;
>>> -        bp.byte_align = PAGE_SIZE;
>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>> -        bp.type = ttm_bo_type_kernel;
>>> -        bp.resv = NULL;
>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>> -
>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>> -        if (r) {
>>> -            return r;
>>> -        }
>>> -    }
>>> +    if (adev->gart.bo != NULL)
>>> +        return 0;
>>> +
>>> +    memset(&bp, 0, sizeof(bp));
>>> +    bp.size = adev->gart.table_size;
>>> +    bp.byte_align = PAGE_SIZE;
>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>> +    bp.type = ttm_bo_type_kernel;
>>> +    bp.resv = NULL;
>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>> +
>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>> +    if (r)
>>> +        return r;
>>> +
>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>> +    if (r)
>>> +        return r;
>>> +
>>>       return 0;
>>>   }
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>> amdgpu_device *adev)
>>>           return -EINVAL;
>>>       }
>>>
>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>> -    if (r)
>>> -        return r;
>>> +    if (adev->in_suspend) {
>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>
>> When the existing usage of this function is checked, this is called 
>> during reset recovery after ip resume phase1. Can't the same thing be 
>> done in ip_resume() to place this after phase1 resume instead of 
>> repeating in every IP version?
> 
> 
> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
> gmc_v10_0_gart_enable() seems to be correct  place  to do this
> 
> gart specific work.
> 

I see. In that case probably the patch needs to change other call places 
also as this step is already taken care in gart enable.

Thanks,
Lijo

> 
> Regards,
> 
> Nirmoy
> 
> 
> 
>>
>> Thanks,
>> Lijo
>>
>>> +        if (r)
>>> +            return r;
>>> +    }
>>>
>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>       if (r)
>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>> amdgpu_device *adev)
>>>   {
>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>       adev->mmhub.funcs->gart_disable(adev);
>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>   }
>>>
>>>   static int gmc_v10_0_hw_fini(void *handle)
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>> index 0a50fdaced7e..02e90d9443c1 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>> amdgpu_device *adev)
>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>           return -EINVAL;
>>>       }
>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>> -    if (r)
>>> -        return r;
>>> +
>>> +    if (adev->in_suspend) {
>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>> +        if (r)
>>> +            return r;
>>> +    }
>>>
>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>
>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>> amdgpu_device *adev)
>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>       WREG32(mmVM_L2_CNTL2, 0);
>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> index 492ebed2915b..dc2577e37688 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>> amdgpu_device *adev)
>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>           return -EINVAL;
>>>       }
>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>> -    if (r)
>>> -        return r;
>>> +
>>> +    if (adev->in_suspend) {
>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>> +        if (r)
>>> +            return r;
>>> +    }
>>>
>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>
>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>> amdgpu_device *adev)
>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>       WREG32(mmVM_L2_CNTL2, 0);
>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>   }
>>>
>>>   /**
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index cb82404df534..732d91dbf449 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>> amdgpu_device *adev)
>>>           return -EINVAL;
>>>       }
>>>
>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>> -    if (r)
>>> -        return r;
>>> +    if (adev->in_suspend) {
>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>> +        if (r)
>>> +            return r;
>>> +    }
>>>
>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>       if (r)
>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>> amdgpu_device *adev)
>>>   {
>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>       adev->mmhub.funcs->gart_disable(adev);
>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>   }
>>>
>>>   static int gmc_v9_0_hw_fini(void *handle)
>>> -- 
>>> 2.32.0
>>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20 10:03       ` Lazar, Lijo
@ 2021-10-20 10:12         ` Das, Nirmoy
  2021-10-20 10:15           ` Lazar, Lijo
  0 siblings, 1 reply; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20 10:12 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky


On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>
>
> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>
>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>
>>>
>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>> instead pin only once and try to recover gart content
>>>> at resume time. This is much more stable in case there
>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>> while evicting GART table.
>>>>
>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>> ++++++++++++----------
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 5807df52031c..f69e613805db 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
>>>> *dev, bool fbcon)
>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>
>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>> -    /* This second call to evict device resources is to evict
>>>> -     * the gart page table using the CPU.
>>>> -     */
>>>> -    amdgpu_device_evict_resources(adev);
>>>>
>>>>       return 0;
>>>>   }
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> index d3e4203f6217..97a9f61fa106 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>> amdgpu_device *adev)
>>>>    *
>>>>    * @adev: amdgpu_device pointer
>>>>    *
>>>> - * Allocate video memory for GART page table
>>>> + * Allocate and pin video memory for GART page table
>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>    * gart table to be in video memory.
>>>>    * Returns 0 for success, error for failure.
>>>>    */
>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>   {
>>>> +    struct amdgpu_bo_param bp;
>>>>       int r;
>>>>
>>>> -    if (adev->gart.bo == NULL) {
>>>> -        struct amdgpu_bo_param bp;
>>>> -
>>>> -        memset(&bp, 0, sizeof(bp));
>>>> -        bp.size = adev->gart.table_size;
>>>> -        bp.byte_align = PAGE_SIZE;
>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>> -        bp.type = ttm_bo_type_kernel;
>>>> -        bp.resv = NULL;
>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>> -
>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>> -        if (r) {
>>>> -            return r;
>>>> -        }
>>>> -    }
>>>> +    if (adev->gart.bo != NULL)
>>>> +        return 0;
>>>> +
>>>> +    memset(&bp, 0, sizeof(bp));
>>>> +    bp.size = adev->gart.table_size;
>>>> +    bp.byte_align = PAGE_SIZE;
>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>> +    bp.type = ttm_bo_type_kernel;
>>>> +    bp.resv = NULL;
>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>> +
>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>> +    if (r)
>>>> +        return r;
>>>> +
>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>> +    if (r)
>>>> +        return r;
>>>> +
>>>>       return 0;
>>>>   }
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>> amdgpu_device *adev)
>>>>           return -EINVAL;
>>>>       }
>>>>
>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>> -    if (r)
>>>> -        return r;
>>>> +    if (adev->in_suspend) {
>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>
>>> When the existing usage of this function is checked, this is called 
>>> during reset recovery after ip resume phase1. Can't the same thing 
>>> be done in ip_resume() to place this after phase1 resume instead of 
>>> repeating in every IP version?
>>
>>
>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>> gmc_v10_0_gart_enable() seems to be correct  place  to do this
>>
>> gart specific work.
>>
>
> I see. In that case probably the patch needs to change other call 
> places also as this step is already taken care in gart enable.


Do you mean amdgpu_do_asic_reset() ?


Nirmoy


>
> Thanks,
> Lijo
>
>>
>> Regards,
>>
>> Nirmoy
>>
>>
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>> +        if (r)
>>>> +            return r;
>>>> +    }
>>>>
>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>       if (r)
>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>   }
>>>>
>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>> amdgpu_device *adev)
>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>           return -EINVAL;
>>>>       }
>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>> -    if (r)
>>>> -        return r;
>>>> +
>>>> +    if (adev->in_suspend) {
>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>> +        if (r)
>>>> +            return r;
>>>> +    }
>>>>
>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>
>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>> amdgpu_device *adev)
>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> index 492ebed2915b..dc2577e37688 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>> amdgpu_device *adev)
>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>           return -EINVAL;
>>>>       }
>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>> -    if (r)
>>>> -        return r;
>>>> +
>>>> +    if (adev->in_suspend) {
>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>> +        if (r)
>>>> +            return r;
>>>> +    }
>>>>
>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>
>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>> amdgpu_device *adev)
>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>   }
>>>>
>>>>   /**
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> index cb82404df534..732d91dbf449 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>> amdgpu_device *adev)
>>>>           return -EINVAL;
>>>>       }
>>>>
>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>> -    if (r)
>>>> -        return r;
>>>> +    if (adev->in_suspend) {
>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>> +        if (r)
>>>> +            return r;
>>>> +    }
>>>>
>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>       if (r)
>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>> amdgpu_device *adev)
>>>>   {
>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>   }
>>>>
>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>> -- 
>>>> 2.32.0
>>>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20 10:12         ` Das, Nirmoy
@ 2021-10-20 10:15           ` Lazar, Lijo
  2021-10-20 10:21             ` Das, Nirmoy
  0 siblings, 1 reply; 18+ messages in thread
From: Lazar, Lijo @ 2021-10-20 10:15 UTC (permalink / raw)
  To: Das, Nirmoy, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky



On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
> 
> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>
>>
>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>
>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>> instead pin only once and try to recover gart content
>>>>> at resume time. This is much more stable in case there
>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>> while evicting GART table.
>>>>>
>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>> ++++++++++++----------
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 5807df52031c..f69e613805db 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
>>>>> *dev, bool fbcon)
>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>
>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>> -    /* This second call to evict device resources is to evict
>>>>> -     * the gart page table using the CPU.
>>>>> -     */
>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>
>>>>>       return 0;
>>>>>   }
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>> amdgpu_device *adev)
>>>>>    *
>>>>>    * @adev: amdgpu_device pointer
>>>>>    *
>>>>> - * Allocate video memory for GART page table
>>>>> + * Allocate and pin video memory for GART page table
>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>    * gart table to be in video memory.
>>>>>    * Returns 0 for success, error for failure.
>>>>>    */
>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>   {
>>>>> +    struct amdgpu_bo_param bp;
>>>>>       int r;
>>>>>
>>>>> -    if (adev->gart.bo == NULL) {
>>>>> -        struct amdgpu_bo_param bp;
>>>>> -
>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>> -        bp.size = adev->gart.table_size;
>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>> -        bp.resv = NULL;
>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>> -
>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>> -        if (r) {
>>>>> -            return r;
>>>>> -        }
>>>>> -    }
>>>>> +    if (adev->gart.bo != NULL)
>>>>> +        return 0;
>>>>> +
>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>> +    bp.size = adev->gart.table_size;
>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>> +    bp.resv = NULL;
>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>> +
>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>> +    if (r)
>>>>> +        return r;
>>>>> +
>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>> +    if (r)
>>>>> +        return r;
>>>>> +
>>>>>       return 0;
>>>>>   }
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>> amdgpu_device *adev)
>>>>>           return -EINVAL;
>>>>>       }
>>>>>
>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>> -    if (r)
>>>>> -        return r;
>>>>> +    if (adev->in_suspend) {
>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>
>>>> When the existing usage of this function is checked, this is called 
>>>> during reset recovery after ip resume phase1. Can't the same thing 
>>>> be done in ip_resume() to place this after phase1 resume instead of 
>>>> repeating in every IP version?
>>>
>>>
>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>> gmc_v10_0_gart_enable() seems to be correct  place  to do this
>>>
>>> gart specific work.
>>>
>>
>> I see. In that case probably the patch needs to change other call 
>> places also as this step is already taken care in gart enable.
> 
> 
> Do you mean amdgpu_do_asic_reset() ?
> 

Yes, and saw it called in one more place related to sriov reset (didn't 
track the sriov reset path though).

Thanks,
Lijo

> 
> Nirmoy
> 
> 
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Regards,
>>>
>>> Nirmoy
>>>
>>>
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>> +        if (r)
>>>>> +            return r;
>>>>> +    }
>>>>>
>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>       if (r)
>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>> amdgpu_device *adev)
>>>>>   {
>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>   }
>>>>>
>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>> amdgpu_device *adev)
>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>           return -EINVAL;
>>>>>       }
>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>> -    if (r)
>>>>> -        return r;
>>>>> +
>>>>> +    if (adev->in_suspend) {
>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>> +        if (r)
>>>>> +            return r;
>>>>> +    }
>>>>>
>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>
>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>> amdgpu_device *adev)
>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>> amdgpu_device *adev)
>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>           return -EINVAL;
>>>>>       }
>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>> -    if (r)
>>>>> -        return r;
>>>>> +
>>>>> +    if (adev->in_suspend) {
>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>> +        if (r)
>>>>> +            return r;
>>>>> +    }
>>>>>
>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>
>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>> amdgpu_device *adev)
>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>   }
>>>>>
>>>>>   /**
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> index cb82404df534..732d91dbf449 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>> amdgpu_device *adev)
>>>>>           return -EINVAL;
>>>>>       }
>>>>>
>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>> -    if (r)
>>>>> -        return r;
>>>>> +    if (adev->in_suspend) {
>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>> +        if (r)
>>>>> +            return r;
>>>>> +    }
>>>>>
>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>       if (r)
>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>> amdgpu_device *adev)
>>>>>   {
>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>   }
>>>>>
>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>> -- 
>>>>> 2.32.0
>>>>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20 10:15           ` Lazar, Lijo
@ 2021-10-20 10:21             ` Das, Nirmoy
  2021-10-20 10:51               ` Christian König
  0 siblings, 1 reply; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20 10:21 UTC (permalink / raw)
  To: Lazar, Lijo, amd-gfx; +Cc: Christian.Koenig, andrey.grodzovsky


On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>
>
> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>
>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>
>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>> instead pin only once and try to recover gart content
>>>>>> at resume time. This is much more stable in case there
>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>> while evicting GART table.
>>>>>>
>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>> ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>>> ++++++++++++----------
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
>>>>>> drm_device *dev, bool fbcon)
>>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>>
>>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>>> -    /* This second call to evict device resources is to evict
>>>>>> -     * the gart page table using the CPU.
>>>>>> -     */
>>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>>
>>>>>>       return 0;
>>>>>>   }
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>>> amdgpu_device *adev)
>>>>>>    *
>>>>>>    * @adev: amdgpu_device pointer
>>>>>>    *
>>>>>> - * Allocate video memory for GART page table
>>>>>> + * Allocate and pin video memory for GART page table
>>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>>    * gart table to be in video memory.
>>>>>>    * Returns 0 for success, error for failure.
>>>>>>    */
>>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>>   {
>>>>>> +    struct amdgpu_bo_param bp;
>>>>>>       int r;
>>>>>>
>>>>>> -    if (adev->gart.bo == NULL) {
>>>>>> -        struct amdgpu_bo_param bp;
>>>>>> -
>>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>>> -        bp.size = adev->gart.table_size;
>>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>>> -        bp.resv = NULL;
>>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>> -
>>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>> -        if (r) {
>>>>>> -            return r;
>>>>>> -        }
>>>>>> -    }
>>>>>> +    if (adev->gart.bo != NULL)
>>>>>> +        return 0;
>>>>>> +
>>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>>> +    bp.size = adev->gart.table_size;
>>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>>> +    bp.resv = NULL;
>>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>> +
>>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>> +    if (r)
>>>>>> +        return r;
>>>>>> +
>>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>>> +    if (r)
>>>>>> +        return r;
>>>>>> +
>>>>>>       return 0;
>>>>>>   }
>>>>>>
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           return -EINVAL;
>>>>>>       }
>>>>>>
>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>> -    if (r)
>>>>>> -        return r;
>>>>>> +    if (adev->in_suspend) {
>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>
>>>>> When the existing usage of this function is checked, this is 
>>>>> called during reset recovery after ip resume phase1. Can't the 
>>>>> same thing be done in ip_resume() to place this after phase1 
>>>>> resume instead of repeating in every IP version?
>>>>
>>>>
>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>>> gmc_v10_0_gart_enable() seems to be correct place  to do this
>>>>
>>>> gart specific work.
>>>>
>>>
>>> I see. In that case probably the patch needs to change other call 
>>> places also as this step is already taken care in gart enable.
>>
>>
>> Do you mean amdgpu_do_asic_reset() ?
>>
>
> Yes, and saw it called in one more place related to sriov reset 
> (didn't track the sriov reset path though).


True, hmm looks like this patch going to need multiple tested-by tags 
for gfx6,7 and sriov. I only have gfx8,9,10.


Regards,

Nirmoy


>
> Thanks,
> Lijo
>
>>
>> Nirmoy
>>
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Regards,
>>>>
>>>> Nirmoy
>>>>
>>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>> +        if (r)
>>>>>> +            return r;
>>>>>> +    }
>>>>>>
>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>       if (r)
>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>   {
>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>   }
>>>>>>
>>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>           return -EINVAL;
>>>>>>       }
>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>> -    if (r)
>>>>>> -        return r;
>>>>>> +
>>>>>> +    if (adev->in_suspend) {
>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>> +        if (r)
>>>>>> +            return r;
>>>>>> +    }
>>>>>>
>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>
>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>   }
>>>>>>
>>>>>>   /**
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>           return -EINVAL;
>>>>>>       }
>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>> -    if (r)
>>>>>> -        return r;
>>>>>> +
>>>>>> +    if (adev->in_suspend) {
>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>> +        if (r)
>>>>>> +            return r;
>>>>>> +    }
>>>>>>
>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>
>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>   }
>>>>>>
>>>>>>   /**
>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>           return -EINVAL;
>>>>>>       }
>>>>>>
>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>> -    if (r)
>>>>>> -        return r;
>>>>>> +    if (adev->in_suspend) {
>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>> +        if (r)
>>>>>> +            return r;
>>>>>> +    }
>>>>>>
>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>       if (r)
>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>>> amdgpu_device *adev)
>>>>>>   {
>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>   }
>>>>>>
>>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>>> -- 
>>>>>> 2.32.0
>>>>>>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
  2021-10-20  9:19     ` Lazar, Lijo
@ 2021-10-20 10:49       ` Christian König
  2021-10-20 11:12         ` Das, Nirmoy
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2021-10-20 10:49 UTC (permalink / raw)
  To: Lazar, Lijo, Das, Nirmoy, amd-gfx; +Cc: andrey.grodzovsky

Am 20.10.21 um 11:19 schrieb Lazar, Lijo:
>
>
> On 10/20/2021 2:18 PM, Das, Nirmoy wrote:
>>
>> On 10/20/2021 8:49 AM, Christian König wrote:
>>> Am 19.10.21 um 20:14 schrieb Nirmoy Das:
>>>> Do not allow exported amdgpu_gtt_mgr_*() to accept
>>>> any ttm_resource_manager pointer. Also there is no need
>>>> to force other module to call a ttm function just to
>>>> eventually call gtt_mgr functions.
>>>
>>> That's a rather bad idea I think.
>>>
>>> The GTT and VRAM manager work on their respective objects and not on 
>>> the adev directly.
>>
>>
>> What is bothering me is : it is obvious that  the 
>> amdgpu_gtt_mgr_usage() for example should only calculate
>>
>> usages for TTM_PL_TT type resource manager, why to pass that 
>> explicitly. I am trying to leverage the fact that
>>
>> we only have one gtt/vram manager for a adev and the functions that I 
>> changed  work on whole gtt/vram manager
>>
>> as a unit.
>>
>
> Don't know about the functional aspects. From a sofware perspective, 
> amdgpu_gtt_mgr_*() operating on struct amdgpu_gtt_mgr *mgr seems more 
> logical.

What we could do is to pass in amdgpu_gtt_mgr instead of 
ttm_resource_manager and then use &adev->mman.gtt_mgr.

Regards,
Christian.

>
>
> Thanks,
> Lijo
>
>>
>> Regards,
>>
>> Nirmoy
>>
>>
>>>
>>> Christian.
>>>
>>>>
>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 
>>>> ++++++++++++---------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
>>>>   4 files changed, 24 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> index 41ce86244144..5807df52031c 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>> @@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
>>>> amdgpu_device *adev,
>>>>         amdgpu_virt_init_data_exchange(adev);
>>>>       /* we need recover gart prior to run SMC/CP/SDMA resume */
>>>> - amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, 
>>>> TTM_PL_TT));
>>>> +    amdgpu_gtt_mgr_recover(adev);
>>>>         r = amdgpu_device_fw_loading(adev);
>>>>       if (r)
>>>> @@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
>>>> *device_list_handle,
>>>>                       amdgpu_inc_vram_lost(tmp_adev);
>>>>                   }
>>>>   -                r = 
>>>> amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
>>>> TTM_PL_TT));
>>>> +                r = amdgpu_gtt_mgr_recover(tmp_adev);
>>>>                   if (r)
>>>>                       goto out;
>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>> index c18f16b3be9c..5e41f8ef743a 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>> @@ -77,10 +77,8 @@ static ssize_t 
>>>> amdgpu_mem_info_gtt_used_show(struct device *dev,
>>>>   {
>>>>       struct drm_device *ddev = dev_get_drvdata(dev);
>>>>       struct amdgpu_device *adev = drm_to_adev(ddev);
>>>> -    struct ttm_resource_manager *man;
>>>>   -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>> -    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
>>>> +    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
>>>>   }
>>>>     static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
>>>> @@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
>>>> ttm_resource_manager *man,
>>>>   /**
>>>>    * amdgpu_gtt_mgr_usage - return usage of GTT domain
>>>>    *
>>>> - * @man: TTM memory type manager
>>>> + * @adev: amdgpu_device pointer
>>>>    *
>>>>    * Return how many bytes are used in the GTT domain
>>>>    */
>>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
>>>>   {
>>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>> -    s64 result = man->size - atomic64_read(&mgr->available);
>>>> +    struct ttm_resource_manager *man;
>>>> +    struct amdgpu_gtt_mgr *mgr;
>>>> +    s64 result;
>>>> +
>>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>> +    mgr = to_gtt_mgr(man);
>>>> +    result = man->size - atomic64_read(&mgr->available);
>>>>         return (result > 0 ? result : 0) * PAGE_SIZE;
>>>>   }
>>>> @@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
>>>> ttm_resource_manager *man)
>>>>   /**
>>>>    * amdgpu_gtt_mgr_recover - re-init gart
>>>>    *
>>>> - * @man: TTM memory type manager
>>>> + * @adev: amdgpu_device pointer
>>>>    *
>>>>    * Re-init the gart for each known BO in the GTT.
>>>>    */
>>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
>>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
>>>>   {
>>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>> -    struct amdgpu_device *adev;
>>>> +    struct ttm_resource_manager *man;
>>>> +    struct amdgpu_gtt_mgr *mgr;
>>>>       struct amdgpu_gtt_node *node;
>>>>       struct drm_mm_node *mm_node;
>>>>       int r = 0;
>>>>   -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
>>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>> +    mgr = to_gtt_mgr(man);
>>>>       spin_lock(&mgr->lock);
>>>>       drm_mm_for_each_node(mm_node, &mgr->mm) {
>>>>           node = container_of(mm_node, typeof(*node), 
>>>> base.mm_nodes[0]);
>>>> @@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>>> ttm_resource_manager *man,
>>>>                    struct drm_printer *printer)
>>>>   {
>>>>       struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>> +    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
>>>> mman.gtt_mgr);
>>>>         spin_lock(&mgr->lock);
>>>>       drm_mm_print(&mgr->mm, printer);
>>>> @@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>>> ttm_resource_manager *man,
>>>>         drm_printf(printer, "man size:%llu pages, gtt 
>>>> available:%lld pages, usage:%lluMB\n",
>>>>              man->size, (u64)atomic64_read(&mgr->available),
>>>> -           amdgpu_gtt_mgr_usage(man) >> 20);
>>>> +           amdgpu_gtt_mgr_usage(adev) >> 20);
>>>>   }
>>>>     static const struct ttm_resource_manager_func 
>>>> amdgpu_gtt_mgr_func = {
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> index d2955ea4a62b..b9b38f70e416 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>> @@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>>> void *data, struct drm_file *filp)
>>>>           ui64 = 
>>>> amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, 
>>>> TTM_PL_VRAM));
>>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT 
>>>> : 0;
>>>>       case AMDGPU_INFO_GTT_USAGE:
>>>> -        ui64 = 
>>>> amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>>>> +        ui64 = amdgpu_gtt_mgr_usage(adev);
>>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT 
>>>> : 0;
>>>>       case AMDGPU_INFO_GDS_CONFIG: {
>>>>           struct drm_amdgpu_info_gds gds_info;
>>>> @@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>>> void *data, struct drm_file *filp)
>>>>           mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
>>>>               atomic64_read(&adev->gart_pin_size);
>>>>           mem.gtt.heap_usage =
>>>> -            amdgpu_gtt_mgr_usage(gtt_man);
>>>> +            amdgpu_gtt_mgr_usage(adev);
>>>>           mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
>>>>             return copy_to_user(out, &mem,
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> index 91a087f9dc7c..6e337cacef51 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>> @@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device 
>>>> *adev);
>>>>   void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>>>>     bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
>>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
>>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
>>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
>>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
>>>>     uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager 
>>>> *man);
>>>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20 10:21             ` Das, Nirmoy
@ 2021-10-20 10:51               ` Christian König
  2021-10-20 11:10                 ` Das, Nirmoy
  0 siblings, 1 reply; 18+ messages in thread
From: Christian König @ 2021-10-20 10:51 UTC (permalink / raw)
  To: Das, Nirmoy, Lazar, Lijo, amd-gfx; +Cc: andrey.grodzovsky



Am 20.10.21 um 12:21 schrieb Das, Nirmoy:
>
> On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>>
>>
>> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>>
>>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>>
>>>>
>>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>>
>>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>>
>>>>>>
>>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>>> instead pin only once and try to recover gart content
>>>>>>> at resume time. This is much more stable in case there
>>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>>> while evicting GART table.
>>>>>>>
>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>> ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>>>> ++++++++++++----------
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
>>>>>>> drm_device *dev, bool fbcon)
>>>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>>>
>>>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>>>> -    /* This second call to evict device resources is to evict
>>>>>>> -     * the gart page table using the CPU.
>>>>>>> -     */
>>>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>>>
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>    *
>>>>>>>    * @adev: amdgpu_device pointer
>>>>>>>    *
>>>>>>> - * Allocate video memory for GART page table
>>>>>>> + * Allocate and pin video memory for GART page table
>>>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>>>    * gart table to be in video memory.
>>>>>>>    * Returns 0 for success, error for failure.
>>>>>>>    */
>>>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>>>   {
>>>>>>> +    struct amdgpu_bo_param bp;
>>>>>>>       int r;
>>>>>>>
>>>>>>> -    if (adev->gart.bo == NULL) {
>>>>>>> -        struct amdgpu_bo_param bp;
>>>>>>> -
>>>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>>>> -        bp.size = adev->gart.table_size;
>>>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>>>> -        bp.resv = NULL;
>>>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>> -
>>>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>> -        if (r) {
>>>>>>> -            return r;
>>>>>>> -        }
>>>>>>> -    }
>>>>>>> +    if (adev->gart.bo != NULL)
>>>>>>> +        return 0;
>>>>>>> +
>>>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>>>> +    bp.size = adev->gart.table_size;
>>>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>>>> +    bp.resv = NULL;
>>>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>> +
>>>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>> +    if (r)
>>>>>>> +        return r;
>>>>>>> +
>>>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> +    if (r)
>>>>>>> +        return r;
>>>>>>> +
>>>>>>>       return 0;
>>>>>>>   }
>>>>>>>
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>>
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>
>>>>>> When the existing usage of this function is checked, this is 
>>>>>> called during reset recovery after ip resume phase1. Can't the 
>>>>>> same thing be done in ip_resume() to place this after phase1 
>>>>>> resume instead of repeating in every IP version?
>>>>>
>>>>>
>>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>>>> gmc_v10_0_gart_enable() seems to be correct place  to do this
>>>>>
>>>>> gart specific work.
>>>>>
>>>>
>>>> I see. In that case probably the patch needs to change other call 
>>>> places also as this step is already taken care in gart enable.
>>>
>>>
>>> Do you mean amdgpu_do_asic_reset() ?
>>>
>>
>> Yes, and saw it called in one more place related to sriov reset 
>> (didn't track the sriov reset path though).
>
>
> True, hmm looks like this patch going to need multiple tested-by tags 
> for gfx6,7 and sriov. I only have gfx8,9,10.

You also need to test this on APUs as well, when it works won Raven/gfx9 
I'm pretty sure it will work on other generations as well (except for 
typos of course).

Christian.

>
>
> Regards,
>
> Nirmoy
>
>
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Nirmoy
>>>
>>>
>>>>
>>>> Thanks,
>>>> Lijo
>>>>
>>>>>
>>>>> Regards,
>>>>>
>>>>> Nirmoy
>>>>>
>>>>>
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Lijo
>>>>>>
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>       if (r)
>>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>
>>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>
>>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   /**
>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>           return -EINVAL;
>>>>>>>       }
>>>>>>>
>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>> -    if (r)
>>>>>>> -        return r;
>>>>>>> +    if (adev->in_suspend) {
>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>> +        if (r)
>>>>>>> +            return r;
>>>>>>> +    }
>>>>>>>
>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>       if (r)
>>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>>>> amdgpu_device *adev)
>>>>>>>   {
>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>   }
>>>>>>>
>>>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>>>> -- 
>>>>>>> 2.32.0
>>>>>>>


^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume
  2021-10-20 10:51               ` Christian König
@ 2021-10-20 11:10                 ` Das, Nirmoy
  0 siblings, 0 replies; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20 11:10 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, amd-gfx; +Cc: andrey.grodzovsky


On 10/20/2021 12:51 PM, Christian König wrote:
>
>
> Am 20.10.21 um 12:21 schrieb Das, Nirmoy:
>>
>> On 10/20/2021 12:15 PM, Lazar, Lijo wrote:
>>>
>>>
>>> On 10/20/2021 3:42 PM, Das, Nirmoy wrote:
>>>>
>>>> On 10/20/2021 12:03 PM, Lazar, Lijo wrote:
>>>>>
>>>>>
>>>>> On 10/20/2021 3:23 PM, Das, Nirmoy wrote:
>>>>>>
>>>>>> On 10/20/2021 11:11 AM, Lazar, Lijo wrote:
>>>>>>>
>>>>>>>
>>>>>>> On 10/19/2021 11:44 PM, Nirmoy Das wrote:
>>>>>>>> Get rid off pin/unpin of gart BO at resume/suspend and
>>>>>>>> instead pin only once and try to recover gart content
>>>>>>>> at resume time. This is much more stable in case there
>>>>>>>> is OOM situation at 2nd call to amdgpu_device_evict_resources()
>>>>>>>> while evicting GART table.
>>>>>>>>
>>>>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>>>>> ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
>>>>>>>> ++++++++++++----------
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c     |  9 ++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c      | 10 +++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c      | 10 +++---
>>>>>>>>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c      |  9 ++---
>>>>>>>>   6 files changed, 45 insertions(+), 39 deletions(-)
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> index 5807df52031c..f69e613805db 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>>>>> @@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
>>>>>>>> drm_device *dev, bool fbcon)
>>>>>>>>       amdgpu_fence_driver_hw_fini(adev);
>>>>>>>>
>>>>>>>>       amdgpu_device_ip_suspend_phase2(adev);
>>>>>>>> -    /* This second call to evict device resources is to evict
>>>>>>>> -     * the gart page table using the CPU.
>>>>>>>> -     */
>>>>>>>> -    amdgpu_device_evict_resources(adev);
>>>>>>>>
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> index d3e4203f6217..97a9f61fa106 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
>>>>>>>> @@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>    *
>>>>>>>>    * @adev: amdgpu_device pointer
>>>>>>>>    *
>>>>>>>> - * Allocate video memory for GART page table
>>>>>>>> + * Allocate and pin video memory for GART page table
>>>>>>>>    * (pcie r4xx, r5xx+).  These asics require the
>>>>>>>>    * gart table to be in video memory.
>>>>>>>>    * Returns 0 for success, error for failure.
>>>>>>>>    */
>>>>>>>>   int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>> +    struct amdgpu_bo_param bp;
>>>>>>>>       int r;
>>>>>>>>
>>>>>>>> -    if (adev->gart.bo == NULL) {
>>>>>>>> -        struct amdgpu_bo_param bp;
>>>>>>>> -
>>>>>>>> -        memset(&bp, 0, sizeof(bp));
>>>>>>>> -        bp.size = adev->gart.table_size;
>>>>>>>> -        bp.byte_align = PAGE_SIZE;
>>>>>>>> -        bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>>> -        bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>>> -            AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>>> -        bp.type = ttm_bo_type_kernel;
>>>>>>>> -        bp.resv = NULL;
>>>>>>>> -        bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>>> -
>>>>>>>> -        r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>>> -        if (r) {
>>>>>>>> -            return r;
>>>>>>>> -        }
>>>>>>>> -    }
>>>>>>>> +    if (adev->gart.bo != NULL)
>>>>>>>> +        return 0;
>>>>>>>> +
>>>>>>>> +    memset(&bp, 0, sizeof(bp));
>>>>>>>> +    bp.size = adev->gart.table_size;
>>>>>>>> +    bp.byte_align = PAGE_SIZE;
>>>>>>>> +    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
>>>>>>>> +    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
>>>>>>>> +        AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
>>>>>>>> +    bp.type = ttm_bo_type_kernel;
>>>>>>>> +    bp.resv = NULL;
>>>>>>>> +    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
>>>>>>>> +
>>>>>>>> +    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
>>>>>>>> +    if (r)
>>>>>>>> +        return r;
>>>>>>>> +
>>>>>>>> +    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> +    if (r)
>>>>>>>> +        return r;
>>>>>>>> +
>>>>>>>>       return 0;
>>>>>>>>   }
>>>>>>>>
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> index 3ec5ff5a6dbe..75d584e1b0e9 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>>>>>>> @@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>>
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>
>>>>>>> When the existing usage of this function is checked, this is 
>>>>>>> called during reset recovery after ip resume phase1. Can't the 
>>>>>>> same thing be done in ip_resume() to place this after phase1 
>>>>>>> resume instead of repeating in every IP version?
>>>>>>
>>>>>>
>>>>>> Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
>>>>>> gmc_v10_0_gart_enable() seems to be correct place  to do this
>>>>>>
>>>>>> gart specific work.
>>>>>>
>>>>>
>>>>> I see. In that case probably the patch needs to change other call 
>>>>> places also as this step is already taken care in gart enable.
>>>>
>>>>
>>>> Do you mean amdgpu_do_asic_reset() ?
>>>>
>>>
>>> Yes, and saw it called in one more place related to sriov reset 
>>> (didn't track the sriov reset path though).
>>
>>
>> True, hmm looks like this patch going to need multiple tested-by tags 
>> for gfx6,7 and sriov. I only have gfx8,9,10.
>
> You also need to test this on APUs as well, when it works won 
> Raven/gfx9 I'm pretty sure it will work on other generations as well 
> (except for typos of course).


I have a raven APU. I will test on that as well.


Regards,

Nirmoy

>
> Christian.
>
>>
>>
>> Regards,
>>
>> Nirmoy
>>
>>
>>>
>>> Thanks,
>>> Lijo
>>>
>>>>
>>>> Nirmoy
>>>>
>>>>
>>>>>
>>>>> Thanks,
>>>>> Lijo
>>>>>
>>>>>>
>>>>>> Regards,
>>>>>>
>>>>>> Nirmoy
>>>>>>
>>>>>>
>>>>>>
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Lijo
>>>>>>>
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>>       if (r)
>>>>>>>> @@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   static int gmc_v10_0_hw_fini(void *handle)
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> index 0a50fdaced7e..02e90d9443c1 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
>>>>>>>> @@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>>
>>>>>>>> @@ -758,7 +761,6 @@ static void gmc_v7_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> index 492ebed2915b..dc2577e37688 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
>>>>>>>> @@ -837,9 +837,12 @@ static int gmc_v8_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
>>>>>>>>
>>>>>>>> @@ -992,7 +995,6 @@ static void gmc_v8_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>       tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
>>>>>>>>       WREG32(mmVM_L2_CNTL, tmp);
>>>>>>>>       WREG32(mmVM_L2_CNTL2, 0);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   /**
>>>>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
>>>>>>>> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> index cb82404df534..732d91dbf449 100644
>>>>>>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>>>>>>> @@ -1714,9 +1714,11 @@ static int gmc_v9_0_gart_enable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>           return -EINVAL;
>>>>>>>>       }
>>>>>>>>
>>>>>>>> -    r = amdgpu_gart_table_vram_pin(adev);
>>>>>>>> -    if (r)
>>>>>>>> -        return r;
>>>>>>>> +    if (adev->in_suspend) {
>>>>>>>> +        r = amdgpu_gtt_mgr_recover(adev);
>>>>>>>> +        if (r)
>>>>>>>> +            return r;
>>>>>>>> +    }
>>>>>>>>
>>>>>>>>       r = adev->gfxhub.funcs->gart_enable(adev);
>>>>>>>>       if (r)
>>>>>>>> @@ -1793,7 +1795,6 @@ static void gmc_v9_0_gart_disable(struct 
>>>>>>>> amdgpu_device *adev)
>>>>>>>>   {
>>>>>>>>       adev->gfxhub.funcs->gart_disable(adev);
>>>>>>>>       adev->mmhub.funcs->gart_disable(adev);
>>>>>>>> -    amdgpu_gart_table_vram_unpin(adev);
>>>>>>>>   }
>>>>>>>>
>>>>>>>>   static int gmc_v9_0_hw_fini(void *handle)
>>>>>>>> -- 
>>>>>>>> 2.32.0
>>>>>>>>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr
  2021-10-20 10:49       ` Christian König
@ 2021-10-20 11:12         ` Das, Nirmoy
  0 siblings, 0 replies; 18+ messages in thread
From: Das, Nirmoy @ 2021-10-20 11:12 UTC (permalink / raw)
  To: Christian König, Lazar, Lijo, amd-gfx; +Cc: andrey.grodzovsky


On 10/20/2021 12:49 PM, Christian König wrote:
> Am 20.10.21 um 11:19 schrieb Lazar, Lijo:
>>
>>
>> On 10/20/2021 2:18 PM, Das, Nirmoy wrote:
>>>
>>> On 10/20/2021 8:49 AM, Christian König wrote:
>>>> Am 19.10.21 um 20:14 schrieb Nirmoy Das:
>>>>> Do not allow exported amdgpu_gtt_mgr_*() to accept
>>>>> any ttm_resource_manager pointer. Also there is no need
>>>>> to force other module to call a ttm function just to
>>>>> eventually call gtt_mgr functions.
>>>>
>>>> That's a rather bad idea I think.
>>>>
>>>> The GTT and VRAM manager work on their respective objects and not 
>>>> on the adev directly.
>>>
>>>
>>> What is bothering me is : it is obvious that  the 
>>> amdgpu_gtt_mgr_usage() for example should only calculate
>>>
>>> usages for TTM_PL_TT type resource manager, why to pass that 
>>> explicitly. I am trying to leverage the fact that
>>>
>>> we only have one gtt/vram manager for a adev and the functions that 
>>> I changed  work on whole gtt/vram manager
>>>
>>> as a unit.
>>>
>>
>> Don't know about the functional aspects. From a sofware perspective, 
>> amdgpu_gtt_mgr_*() operating on struct amdgpu_gtt_mgr *mgr seems more 
>> logical.
>
> What we could do is to pass in amdgpu_gtt_mgr instead of 
> ttm_resource_manager and then use &adev->mman.gtt_mgr.


Sounds good, I will try this way then.


Regards,

Nirmoy

>
> Regards,
> Christian.
>
>>
>>
>> Thanks,
>> Lijo
>>
>>>
>>> Regards,
>>>
>>> Nirmoy
>>>
>>>
>>>>
>>>> Christian.
>>>>
>>>>>
>>>>> Signed-off-by: Nirmoy Das <nirmoy.das@amd.com>
>>>>> ---
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 
>>>>> ++++++++++++---------
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c     |  4 +--
>>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h     |  4 +--
>>>>>   4 files changed, 24 insertions(+), 19 deletions(-)
>>>>>
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> index 41ce86244144..5807df52031c 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>>>>> @@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
>>>>> amdgpu_device *adev,
>>>>>         amdgpu_virt_init_data_exchange(adev);
>>>>>       /* we need recover gart prior to run SMC/CP/SDMA resume */
>>>>> - amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, 
>>>>> TTM_PL_TT));
>>>>> +    amdgpu_gtt_mgr_recover(adev);
>>>>>         r = amdgpu_device_fw_loading(adev);
>>>>>       if (r)
>>>>> @@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
>>>>> *device_list_handle,
>>>>>                       amdgpu_inc_vram_lost(tmp_adev);
>>>>>                   }
>>>>>   -                r = 
>>>>> amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
>>>>> TTM_PL_TT));
>>>>> +                r = amdgpu_gtt_mgr_recover(tmp_adev);
>>>>>                   if (r)
>>>>>                       goto out;
>>>>>   diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>>> index c18f16b3be9c..5e41f8ef743a 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
>>>>> @@ -77,10 +77,8 @@ static ssize_t 
>>>>> amdgpu_mem_info_gtt_used_show(struct device *dev,
>>>>>   {
>>>>>       struct drm_device *ddev = dev_get_drvdata(dev);
>>>>>       struct amdgpu_device *adev = drm_to_adev(ddev);
>>>>> -    struct ttm_resource_manager *man;
>>>>>   -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>>> -    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
>>>>> +    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
>>>>>   }
>>>>>     static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
>>>>> @@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
>>>>> ttm_resource_manager *man,
>>>>>   /**
>>>>>    * amdgpu_gtt_mgr_usage - return usage of GTT domain
>>>>>    *
>>>>> - * @man: TTM memory type manager
>>>>> + * @adev: amdgpu_device pointer
>>>>>    *
>>>>>    * Return how many bytes are used in the GTT domain
>>>>>    */
>>>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
>>>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
>>>>>   {
>>>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>>> -    s64 result = man->size - atomic64_read(&mgr->available);
>>>>> +    struct ttm_resource_manager *man;
>>>>> +    struct amdgpu_gtt_mgr *mgr;
>>>>> +    s64 result;
>>>>> +
>>>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>>> +    mgr = to_gtt_mgr(man);
>>>>> +    result = man->size - atomic64_read(&mgr->available);
>>>>>         return (result > 0 ? result : 0) * PAGE_SIZE;
>>>>>   }
>>>>> @@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
>>>>> ttm_resource_manager *man)
>>>>>   /**
>>>>>    * amdgpu_gtt_mgr_recover - re-init gart
>>>>>    *
>>>>> - * @man: TTM memory type manager
>>>>> + * @adev: amdgpu_device pointer
>>>>>    *
>>>>>    * Re-init the gart for each known BO in the GTT.
>>>>>    */
>>>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
>>>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
>>>>>   {
>>>>> -    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>>> -    struct amdgpu_device *adev;
>>>>> +    struct ttm_resource_manager *man;
>>>>> +    struct amdgpu_gtt_mgr *mgr;
>>>>>       struct amdgpu_gtt_node *node;
>>>>>       struct drm_mm_node *mm_node;
>>>>>       int r = 0;
>>>>>   -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
>>>>> +    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
>>>>> +    mgr = to_gtt_mgr(man);
>>>>>       spin_lock(&mgr->lock);
>>>>>       drm_mm_for_each_node(mm_node, &mgr->mm) {
>>>>>           node = container_of(mm_node, typeof(*node), 
>>>>> base.mm_nodes[0]);
>>>>> @@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>>>> ttm_resource_manager *man,
>>>>>                    struct drm_printer *printer)
>>>>>   {
>>>>>       struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
>>>>> +    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
>>>>> mman.gtt_mgr);
>>>>>         spin_lock(&mgr->lock);
>>>>>       drm_mm_print(&mgr->mm, printer);
>>>>> @@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct 
>>>>> ttm_resource_manager *man,
>>>>>         drm_printf(printer, "man size:%llu pages, gtt 
>>>>> available:%lld pages, usage:%lluMB\n",
>>>>>              man->size, (u64)atomic64_read(&mgr->available),
>>>>> -           amdgpu_gtt_mgr_usage(man) >> 20);
>>>>> +           amdgpu_gtt_mgr_usage(adev) >> 20);
>>>>>   }
>>>>>     static const struct ttm_resource_manager_func 
>>>>> amdgpu_gtt_mgr_func = {
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> index d2955ea4a62b..b9b38f70e416 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
>>>>> @@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>>>> void *data, struct drm_file *filp)
>>>>>           ui64 = 
>>>>> amdgpu_vram_mgr_vis_usage(ttm_manager_type(&adev->mman.bdev, 
>>>>> TTM_PL_VRAM));
>>>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT 
>>>>> : 0;
>>>>>       case AMDGPU_INFO_GTT_USAGE:
>>>>> -        ui64 = 
>>>>> amdgpu_gtt_mgr_usage(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
>>>>> +        ui64 = amdgpu_gtt_mgr_usage(adev);
>>>>>           return copy_to_user(out, &ui64, min(size, 8u)) ? -EFAULT 
>>>>> : 0;
>>>>>       case AMDGPU_INFO_GDS_CONFIG: {
>>>>>           struct drm_amdgpu_info_gds gds_info;
>>>>> @@ -738,7 +738,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
>>>>> void *data, struct drm_file *filp)
>>>>>           mem.gtt.usable_heap_size = mem.gtt.total_heap_size -
>>>>>               atomic64_read(&adev->gart_pin_size);
>>>>>           mem.gtt.heap_usage =
>>>>> -            amdgpu_gtt_mgr_usage(gtt_man);
>>>>> +            amdgpu_gtt_mgr_usage(adev);
>>>>>           mem.gtt.max_allocation = mem.gtt.usable_heap_size * 3 / 4;
>>>>>             return copy_to_user(out, &mem,
>>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>>> index 91a087f9dc7c..6e337cacef51 100644
>>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>>>>> @@ -114,8 +114,8 @@ int amdgpu_vram_mgr_init(struct amdgpu_device 
>>>>> *adev);
>>>>>   void amdgpu_vram_mgr_fini(struct amdgpu_device *adev);
>>>>>     bool amdgpu_gtt_mgr_has_gart_addr(struct ttm_resource *mem);
>>>>> -uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man);
>>>>> -int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man);
>>>>> +uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev);
>>>>> +int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev);
>>>>>     uint64_t amdgpu_preempt_mgr_usage(struct ttm_resource_manager 
>>>>> *man);
>>>>
>

^ permalink raw reply	[flat|nested] 18+ messages in thread

end of thread, other threads:[~2021-10-20 11:12 UTC | newest]

Thread overview: 18+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-19 18:14 [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Nirmoy Das
2021-10-19 18:14 ` [PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr Nirmoy Das
2021-10-19 18:14 ` [PATCH v2 3/3] drm/amdgpu: recover gart table at resume Nirmoy Das
2021-10-20  6:52   ` Christian König
2021-10-20  8:41     ` Das, Nirmoy
2021-10-20  9:11   ` Lazar, Lijo
2021-10-20  9:53     ` Das, Nirmoy
2021-10-20 10:03       ` Lazar, Lijo
2021-10-20 10:12         ` Das, Nirmoy
2021-10-20 10:15           ` Lazar, Lijo
2021-10-20 10:21             ` Das, Nirmoy
2021-10-20 10:51               ` Christian König
2021-10-20 11:10                 ` Das, Nirmoy
2021-10-20  6:49 ` [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr Christian König
2021-10-20  8:48   ` Das, Nirmoy
2021-10-20  9:19     ` Lazar, Lijo
2021-10-20 10:49       ` Christian König
2021-10-20 11:12         ` Das, Nirmoy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.