* [PATCH 1/2] drm/amdgpu: fix logic inversion in check
@ 2022-01-28 15:16 Christian König
  2022-01-28 15:16 ` [PATCH 2/2] drm/amdgpu: restructure amdgpu_fill_buffer Christian König
  0 siblings, 1 reply; 4+ messages in thread
From: Christian König @ 2022-01-28 15:16 UTC (permalink / raw)
  To: harish.kasiviswanathan, benjaminadam.price, felix.kuehling, amd-gfx

We probably never trigger this, but the logic inside the check is
inverted.

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 3d8a20956b74..2b0e83e9fa8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1938,7 +1938,7 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 	unsigned i;
 	int r;
 
-	if (direct_submit && !ring->sched.ready) {
+	if (!direct_submit && !ring->sched.ready) {
 		DRM_ERROR("Trying to move memory with ring turned off.\n");
 		return -EINVAL;
 	}
-- 
2.25.1



* [PATCH 2/2] drm/amdgpu: restructure amdgpu_fill_buffer
  2022-01-28 15:16 [PATCH 1/2] drm/amdgpu: fix logic inversion in check Christian König
@ 2022-01-28 15:16 ` Christian König
  2022-01-28 15:55   ` Felix Kuehling
  0 siblings, 1 reply; 4+ messages in thread
From: Christian König @ 2022-01-28 15:16 UTC (permalink / raw)
  To: harish.kasiviswanathan, benjaminadam.price, felix.kuehling, amd-gfx

We ran into the problem that clearing a really large buffer (60GiB) caused an
SDMA timeout.

Restructure the function to use the dst window instead of mapping the whole
buffer into the GART, and fill only 2MiB chunks at a time.
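
In outline, the new amdgpu_fill_buffer() loop looks roughly like this
(simplified from the diff below, locking and error handling omitted):

	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
	while (dst.remaining) {
		/* never more than one 2MiB GART transfer window at a time */
		cur_size = min_t(u64, dst.size, AMDGPU_GTT_MAX_TRANSFER_BYTES);

		/* map the current chunk into the dst window ... */
		amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &dst,
				      PFN_UP(cur_size), 1, ring, false, &to);

		/* ... fill it and chain the resulting fences */
		amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv,
				    &next, false);

		amdgpu_res_next(&dst, cur_size);
	}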

Signed-off-by: Christian König <christian.koenig@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 200 +++++++++++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |   2 +
 2 files changed, 114 insertions(+), 88 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 2b0e83e9fa8a..8671ba32fb52 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -296,9 +296,6 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
 			       struct dma_resv *resv,
 			       struct dma_fence **f)
 {
-	const uint32_t GTT_MAX_BYTES = (AMDGPU_GTT_MAX_TRANSFER_SIZE *
-					AMDGPU_GPU_PAGE_SIZE);
-
 	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
 	struct amdgpu_res_cursor src_mm, dst_mm;
 	struct dma_fence *fence = NULL;
@@ -320,12 +317,15 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
 		uint32_t cur_size;
 		uint64_t from, to;
 
-		/* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst
-		 * begins at an offset, then adjust the size accordingly
+		/*
+		 * Copy size cannot exceed AMDGPU_GTT_MAX_TRANSFER_BYTES. So if
+		 * src or dst begins at an offset, then adjust the size
+		 * accordingly
 		 */
 		cur_size = max(src_page_offset, dst_page_offset);
 		cur_size = min(min3(src_mm.size, dst_mm.size, size),
-			       (uint64_t)(GTT_MAX_BYTES - cur_size));
+			       (uint64_t)(AMDGPU_GTT_MAX_TRANSFER_BYTES -
+					  cur_size));
 
 		/* Map src to window 0 and dst to window 1. */
 		r = amdgpu_ttm_map_buffer(src->bo, src->mem, &src_mm,
@@ -395,8 +395,7 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
 	    (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
 		struct dma_fence *wipe_fence = NULL;
 
-		r = amdgpu_fill_buffer(ttm_to_amdgpu_bo(bo), AMDGPU_POISON,
-				       NULL, &wipe_fence);
+		r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence);
 		if (r) {
 			goto error;
 		} else if (wipe_fence) {
@@ -1922,19 +1921,51 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
 	adev->mman.buffer_funcs_enabled = enable;
 }
 
+static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev,
+				  bool direct_submit,
+				  unsigned int num_dw,
+				  struct dma_resv *resv,
+				  bool vm_needs_flush,
+				  struct amdgpu_job **job)
+{
+	enum amdgpu_ib_pool_type pool = direct_submit ?
+		AMDGPU_IB_POOL_DIRECT :
+		AMDGPU_IB_POOL_DELAYED;
+	int r;
+
+	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, pool, job);
+	if (r)
+		return r;
+
+	if (vm_needs_flush) {
+		(*job)->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gmc.pdb0_bo ?
+							adev->gmc.pdb0_bo :
+							adev->gart.bo);
+		(*job)->vm_needs_flush = true;
+	}
+	if (resv) {
+		r = amdgpu_sync_resv(adev, &(*job)->sync, resv,
+				     AMDGPU_SYNC_ALWAYS,
+				     AMDGPU_FENCE_OWNER_UNDEFINED);
+		if (r) {
+			DRM_ERROR("sync failed (%d).\n", r);
+			amdgpu_job_free(*job);
+			return r;
+		}
+	}
+	return 0;
+}
+
 int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 		       uint64_t dst_offset, uint32_t byte_count,
 		       struct dma_resv *resv,
 		       struct dma_fence **fence, bool direct_submit,
 		       bool vm_needs_flush, bool tmz)
 {
-	enum amdgpu_ib_pool_type pool = direct_submit ? AMDGPU_IB_POOL_DIRECT :
-		AMDGPU_IB_POOL_DELAYED;
 	struct amdgpu_device *adev = ring->adev;
+	unsigned num_loops, num_dw;
 	struct amdgpu_job *job;
-
 	uint32_t max_bytes;
-	unsigned num_loops, num_dw;
 	unsigned i;
 	int r;
 
@@ -1946,26 +1977,11 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 	max_bytes = adev->mman.buffer_funcs->copy_max_bytes;
 	num_loops = DIV_ROUND_UP(byte_count, max_bytes);
 	num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->copy_num_dw, 8);
-
-	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, pool, &job);
+	r = amdgpu_ttm_prepare_job(adev, direct_submit, num_dw,
+				   resv, vm_needs_flush, &job);
 	if (r)
 		return r;
 
-	if (vm_needs_flush) {
-		job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gmc.pdb0_bo ?
-					adev->gmc.pdb0_bo : adev->gart.bo);
-		job->vm_needs_flush = true;
-	}
-	if (resv) {
-		r = amdgpu_sync_resv(adev, &job->sync, resv,
-				     AMDGPU_SYNC_ALWAYS,
-				     AMDGPU_FENCE_OWNER_UNDEFINED);
-		if (r) {
-			DRM_ERROR("sync failed (%d).\n", r);
-			goto error_free;
-		}
-	}
-
 	for (i = 0; i < num_loops; i++) {
 		uint32_t cur_size_in_bytes = min(byte_count, max_bytes);
 
@@ -1995,77 +2011,35 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
 	return r;
 }
 
-int amdgpu_fill_buffer(struct amdgpu_bo *bo,
-		       uint32_t src_data,
-		       struct dma_resv *resv,
-		       struct dma_fence **fence)
+static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data,
+			       uint64_t dst_addr, uint32_t byte_count,
+			       struct dma_resv *resv,
+			       struct dma_fence **fence,
+			       bool vm_needs_flush)
 {
-	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
-	uint32_t max_bytes = adev->mman.buffer_funcs->fill_max_bytes;
-	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
-
-	struct amdgpu_res_cursor cursor;
+	struct amdgpu_device *adev = ring->adev;
 	unsigned int num_loops, num_dw;
-	uint64_t num_bytes;
-
 	struct amdgpu_job *job;
+	uint32_t max_bytes;
+	unsigned int i;
 	int r;
 
-	if (!adev->mman.buffer_funcs_enabled) {
-		DRM_ERROR("Trying to clear memory with ring turned off.\n");
-		return -EINVAL;
-	}
-
-	if (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
-		DRM_ERROR("Trying to clear preemptible memory.\n");
-		return -EINVAL;
-	}
-
-	if (bo->tbo.resource->mem_type == TTM_PL_TT) {
-		r = amdgpu_ttm_alloc_gart(&bo->tbo);
-		if (r)
-			return r;
-	}
-
-	num_bytes = bo->tbo.resource->num_pages << PAGE_SHIFT;
-	num_loops = 0;
-
-	amdgpu_res_first(bo->tbo.resource, 0, num_bytes, &cursor);
-	while (cursor.remaining) {
-		num_loops += DIV_ROUND_UP_ULL(cursor.size, max_bytes);
-		amdgpu_res_next(&cursor, cursor.size);
-	}
-	num_dw = num_loops * adev->mman.buffer_funcs->fill_num_dw;
-
-	/* for IB padding */
-	num_dw += 64;
-
-	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, AMDGPU_IB_POOL_DELAYED,
-				     &job);
+	max_bytes = adev->mman.buffer_funcs->fill_max_bytes;
+	num_loops = DIV_ROUND_UP_ULL(byte_count, max_bytes);
+	num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->fill_num_dw, 8);
+	r = amdgpu_ttm_prepare_job(adev, false, num_dw, resv, vm_needs_flush,
+				   &job);
 	if (r)
 		return r;
 
-	if (resv) {
-		r = amdgpu_sync_resv(adev, &job->sync, resv,
-				     AMDGPU_SYNC_ALWAYS,
-				     AMDGPU_FENCE_OWNER_UNDEFINED);
-		if (r) {
-			DRM_ERROR("sync failed (%d).\n", r);
-			goto error_free;
-		}
-	}
-
-	amdgpu_res_first(bo->tbo.resource, 0, num_bytes, &cursor);
-	while (cursor.remaining) {
-		uint32_t cur_size = min_t(uint64_t, cursor.size, max_bytes);
-		uint64_t dst_addr = cursor.start;
+	for (i = 0; i < num_loops; i++) {
+		uint32_t cur_size = min(byte_count, max_bytes);
 
-		dst_addr += amdgpu_ttm_domain_start(adev,
-						    bo->tbo.resource->mem_type);
 		amdgpu_emit_fill_buffer(adev, &job->ibs[0], src_data, dst_addr,
 					cur_size);
 
-		amdgpu_res_next(&cursor, cur_size);
+		dst_addr += cur_size;
+		byte_count -= cur_size;
 	}
 
 	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
@@ -2082,6 +2056,56 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
 	return r;
 }
 
+int amdgpu_fill_buffer(struct amdgpu_bo *bo,
+			uint32_t src_data,
+			struct dma_resv *resv,
+			struct dma_fence **f)
+{
+	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+	struct dma_fence *fence = NULL;
+	struct amdgpu_res_cursor dst;
+	int r;
+
+	if (!adev->mman.buffer_funcs_enabled) {
+		DRM_ERROR("Trying to clear memory with ring turned off.\n");
+		return -EINVAL;
+	}
+
+	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
+
+	mutex_lock(&adev->mman.gtt_window_lock);
+	while (dst.remaining) {
+		struct dma_fence *next;
+		uint32_t cur_size;
+		uint64_t to;
+
+		cur_size = min_t(u64, dst.size, AMDGPU_GTT_MAX_TRANSFER_BYTES);
+
+		r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &dst,
+					  PFN_UP(cur_size), 1, ring, false,
+					  &to);
+		if (r)
+			goto error;
+
+		r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv,
+					&next, false);
+		if (r)
+			goto error;
+
+		dma_fence_put(fence);
+		fence = next;
+
+		amdgpu_res_next(&dst, cur_size);
+	}
+error:
+	mutex_unlock(&adev->mman.gtt_window_lock);
+	if (f)
+		*f = dma_fence_get(fence);
+	dma_fence_put(fence);
+	return r;
+}
+
 /**
  * amdgpu_ttm_evict_resources - evict memory buffers
  * @adev: amdgpu device object
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
index d9691f262f16..bcd34592e45d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
@@ -35,6 +35,8 @@
 
 #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512
 #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2
+#define AMDGPU_GTT_MAX_TRANSFER_BYTES	(AMDGPU_GTT_MAX_TRANSFER_SIZE * \
+					 AMDGPU_GPU_PAGE_SIZE)
 
 #define AMDGPU_POISON	0xd0bed0be
 
-- 
2.25.1



* Re: [PATCH 2/2] drm/amdgpu: restructure amdgpu_fill_buffer
  2022-01-28 15:16 ` [PATCH 2/2] drm/amdgpu: restructure amdgpu_fill_buffer Christian König
@ 2022-01-28 15:55   ` Felix Kuehling
  2022-01-31 13:39     ` Christian König
  0 siblings, 1 reply; 4+ messages in thread
From: Felix Kuehling @ 2022-01-28 15:55 UTC (permalink / raw)
  To: Christian König, harish.kasiviswanathan, benjaminadam.price,
	amd-gfx


On 2022-01-28 at 10:16, Christian König wrote:
> We ran into the problem that clearing a really large buffer (60GiB) caused an
> SDMA timeout.
>
> Restructure the function to use the dst window instead of mapping the whole
> buffer into the GART, and fill only 2MiB chunks at a time.
>
> Signed-off-by: Christian König <christian.koenig@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 200 +++++++++++++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |   2 +
>   2 files changed, 114 insertions(+), 88 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> index 2b0e83e9fa8a..8671ba32fb52 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> @@ -296,9 +296,6 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
>   			       struct dma_resv *resv,
>   			       struct dma_fence **f)
>   {
> -	const uint32_t GTT_MAX_BYTES = (AMDGPU_GTT_MAX_TRANSFER_SIZE *
> -					AMDGPU_GPU_PAGE_SIZE);
> -
>   	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
>   	struct amdgpu_res_cursor src_mm, dst_mm;
>   	struct dma_fence *fence = NULL;
> @@ -320,12 +317,15 @@ int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
>   		uint32_t cur_size;
>   		uint64_t from, to;
>   
> -		/* Copy size cannot exceed GTT_MAX_BYTES. So if src or dst
> -		 * begins at an offset, then adjust the size accordingly
> +		/*
> +		 * Copy size cannot exceed AMDGPU_GTT_MAX_TRANSFER_BYTES. So if
> +		 * src or dst begins at an offset, then adjust the size
> +		 * accordingly
>   		 */
>   		cur_size = max(src_page_offset, dst_page_offset);
>   		cur_size = min(min3(src_mm.size, dst_mm.size, size),
> -			       (uint64_t)(GTT_MAX_BYTES - cur_size));
> +			       (uint64_t)(AMDGPU_GTT_MAX_TRANSFER_BYTES -
> +					  cur_size));
>   
>   		/* Map src to window 0 and dst to window 1. */
>   		r = amdgpu_ttm_map_buffer(src->bo, src->mem, &src_mm,
> @@ -395,8 +395,7 @@ static int amdgpu_move_blit(struct ttm_buffer_object *bo,
>   	    (abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE)) {
>   		struct dma_fence *wipe_fence = NULL;
>   
> -		r = amdgpu_fill_buffer(ttm_to_amdgpu_bo(bo), AMDGPU_POISON,
> -				       NULL, &wipe_fence);
> +		r = amdgpu_fill_buffer(abo, AMDGPU_POISON, NULL, &wipe_fence);
>   		if (r) {
>   			goto error;
>   		} else if (wipe_fence) {
> @@ -1922,19 +1921,51 @@ void amdgpu_ttm_set_buffer_funcs_status(struct amdgpu_device *adev, bool enable)
>   	adev->mman.buffer_funcs_enabled = enable;
>   }
>   
> +static int amdgpu_ttm_prepare_job(struct amdgpu_device *adev,
> +				  bool direct_submit,
> +				  unsigned int num_dw,
> +				  struct dma_resv *resv,
> +				  bool vm_needs_flush,
> +				  struct amdgpu_job **job)
> +{
> +	enum amdgpu_ib_pool_type pool = direct_submit ?
> +		AMDGPU_IB_POOL_DIRECT :
> +		AMDGPU_IB_POOL_DELAYED;
> +	int r;
> +
> +	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, pool, job);
> +	if (r)
> +		return r;
> +
> +	if (vm_needs_flush) {
> +		(*job)->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gmc.pdb0_bo ?
> +							adev->gmc.pdb0_bo :
> +							adev->gart.bo);
> +		(*job)->vm_needs_flush = true;
> +	}
> +	if (resv) {
> +		r = amdgpu_sync_resv(adev, &(*job)->sync, resv,
> +				     AMDGPU_SYNC_ALWAYS,
> +				     AMDGPU_FENCE_OWNER_UNDEFINED);
> +		if (r) {
> +			DRM_ERROR("sync failed (%d).\n", r);
> +			amdgpu_job_free(*job);
> +			return r;
> +		}
> +	}
> +	return 0;
> +}
> +
>   int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   		       uint64_t dst_offset, uint32_t byte_count,
>   		       struct dma_resv *resv,
>   		       struct dma_fence **fence, bool direct_submit,
>   		       bool vm_needs_flush, bool tmz)
>   {
> -	enum amdgpu_ib_pool_type pool = direct_submit ? AMDGPU_IB_POOL_DIRECT :
> -		AMDGPU_IB_POOL_DELAYED;
>   	struct amdgpu_device *adev = ring->adev;
> +	unsigned num_loops, num_dw;
>   	struct amdgpu_job *job;
> -
>   	uint32_t max_bytes;
> -	unsigned num_loops, num_dw;
>   	unsigned i;
>   	int r;
>   
> @@ -1946,26 +1977,11 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   	max_bytes = adev->mman.buffer_funcs->copy_max_bytes;
>   	num_loops = DIV_ROUND_UP(byte_count, max_bytes);
>   	num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->copy_num_dw, 8);
> -
> -	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, pool, &job);
> +	r = amdgpu_ttm_prepare_job(adev, direct_submit, num_dw,
> +				   resv, vm_needs_flush, &job);
>   	if (r)
>   		return r;
>   
> -	if (vm_needs_flush) {
> -		job->vm_pd_addr = amdgpu_gmc_pd_addr(adev->gmc.pdb0_bo ?
> -					adev->gmc.pdb0_bo : adev->gart.bo);
> -		job->vm_needs_flush = true;
> -	}
> -	if (resv) {
> -		r = amdgpu_sync_resv(adev, &job->sync, resv,
> -				     AMDGPU_SYNC_ALWAYS,
> -				     AMDGPU_FENCE_OWNER_UNDEFINED);
> -		if (r) {
> -			DRM_ERROR("sync failed (%d).\n", r);
> -			goto error_free;
> -		}
> -	}
> -
>   	for (i = 0; i < num_loops; i++) {
>   		uint32_t cur_size_in_bytes = min(byte_count, max_bytes);
>   
> @@ -1995,77 +2011,35 @@ int amdgpu_copy_buffer(struct amdgpu_ring *ring, uint64_t src_offset,
>   	return r;
>   }
>   
> -int amdgpu_fill_buffer(struct amdgpu_bo *bo,
> -		       uint32_t src_data,
> -		       struct dma_resv *resv,
> -		       struct dma_fence **fence)
> +static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, uint32_t src_data,
> +			       uint64_t dst_addr, uint32_t byte_count,
> +			       struct dma_resv *resv,
> +			       struct dma_fence **fence,
> +			       bool vm_needs_flush)
>   {
> -	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> -	uint32_t max_bytes = adev->mman.buffer_funcs->fill_max_bytes;
> -	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> -
> -	struct amdgpu_res_cursor cursor;
> +	struct amdgpu_device *adev = ring->adev;
>   	unsigned int num_loops, num_dw;
> -	uint64_t num_bytes;
> -
>   	struct amdgpu_job *job;
> +	uint32_t max_bytes;
> +	unsigned int i;
>   	int r;
>   
> -	if (!adev->mman.buffer_funcs_enabled) {
> -		DRM_ERROR("Trying to clear memory with ring turned off.\n");
> -		return -EINVAL;
> -	}
> -
> -	if (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {

As far as I can see, you didn't add this check back elsewhere. The
promise for preemptible BOs is that we never have to wait for the GPU
to finish accessing them, because the context using them is preemptible.
This was a necessary condition for disabling GTT usage accounting for
such BOs. If you allow filling such BOs with this function, you're
breaking that promise.


> -		DRM_ERROR("Trying to clear preemptible memory.\n");
> -		return -EINVAL;
> -	}
> -
> -	if (bo->tbo.resource->mem_type == TTM_PL_TT) {
> -		r = amdgpu_ttm_alloc_gart(&bo->tbo);
> -		if (r)
> -			return r;
> -	}
> -
> -	num_bytes = bo->tbo.resource->num_pages << PAGE_SHIFT;
> -	num_loops = 0;
> -
> -	amdgpu_res_first(bo->tbo.resource, 0, num_bytes, &cursor);
> -	while (cursor.remaining) {
> -		num_loops += DIV_ROUND_UP_ULL(cursor.size, max_bytes);
> -		amdgpu_res_next(&cursor, cursor.size);
> -	}
> -	num_dw = num_loops * adev->mman.buffer_funcs->fill_num_dw;
> -
> -	/* for IB padding */
> -	num_dw += 64;
> -
> -	r = amdgpu_job_alloc_with_ib(adev, num_dw * 4, AMDGPU_IB_POOL_DELAYED,
> -				     &job);
> +	max_bytes = adev->mman.buffer_funcs->fill_max_bytes;
> +	num_loops = DIV_ROUND_UP_ULL(byte_count, max_bytes);
> +	num_dw = ALIGN(num_loops * adev->mman.buffer_funcs->fill_num_dw, 8);
> +	r = amdgpu_ttm_prepare_job(adev, false, num_dw, resv, vm_needs_flush,
> +				   &job);
>   	if (r)
>   		return r;
>   
> -	if (resv) {
> -		r = amdgpu_sync_resv(adev, &job->sync, resv,
> -				     AMDGPU_SYNC_ALWAYS,
> -				     AMDGPU_FENCE_OWNER_UNDEFINED);
> -		if (r) {
> -			DRM_ERROR("sync failed (%d).\n", r);
> -			goto error_free;
> -		}
> -	}
> -
> -	amdgpu_res_first(bo->tbo.resource, 0, num_bytes, &cursor);
> -	while (cursor.remaining) {
> -		uint32_t cur_size = min_t(uint64_t, cursor.size, max_bytes);
> -		uint64_t dst_addr = cursor.start;
> +	for (i = 0; i < num_loops; i++) {
> +		uint32_t cur_size = min(byte_count, max_bytes);
>   
> -		dst_addr += amdgpu_ttm_domain_start(adev,
> -						    bo->tbo.resource->mem_type);
>   		amdgpu_emit_fill_buffer(adev, &job->ibs[0], src_data, dst_addr,
>   					cur_size);
>   
> -		amdgpu_res_next(&cursor, cur_size);
> +		dst_addr += cur_size;
> +		byte_count -= cur_size;
>   	}
>   
>   	amdgpu_ring_pad_ib(ring, &job->ibs[0]);
> @@ -2082,6 +2056,56 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
>   	return r;
>   }
>   
> +int amdgpu_fill_buffer(struct amdgpu_bo *bo,
> +			uint32_t src_data,
> +			struct dma_resv *resv,
> +			struct dma_fence **f)
> +{
> +	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> +	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> +	struct dma_fence *fence = NULL;
> +	struct amdgpu_res_cursor dst;
> +	int r;
> +
> +	if (!adev->mman.buffer_funcs_enabled) {
> +		DRM_ERROR("Trying to clear memory with ring turned off.\n");
> +		return -EINVAL;
> +	}
> +
> +	amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &dst);
> +
> +	mutex_lock(&adev->mman.gtt_window_lock);
> +	while (dst.remaining) {
> +		struct dma_fence *next;
> +		uint32_t cur_size;
> +		uint64_t to;
> +
> +		cur_size = min_t(u64, dst.size, AMDGPU_GTT_MAX_TRANSFER_BYTES);

For VRAM, the cur_size could be much bigger because we're not limited by
the GART transfer window. I'm pretty sure that 2MiB is going to add too
much overhead. For the extreme 60GiB BO example, it would require about
30,000 IBs and IB frames (60GiB / 2MiB ≈ 30,720) to fill the entire
buffer. That's a lot of VMID-switching, IB execution, fence signalling,
interrupts, etc.


> +
> +		r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &dst,
> +					  PFN_UP(cur_size), 1, ring, false,
> +					  &to);
> +		if (r)
> +			goto error;
> +
> +		r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv,
> +					&next, false);

If amdgpu_ttm_map_buffer updated the page tables, shouldn't the 
vm_needs_flush parameter be true? This flag should probably be returned 
by amdgpu_ttm_map_buffer.
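
Something roughly like this, as a sketch (the extra out-parameter is
purely illustrative, the exact interface is up for discussion):

	bool gtt_dirty;

	/* hypothetical: map_buffer reports whether it wrote GART PTEs */
	r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &dst,
				  PFN_UP(cur_size), 1, ring, false,
				  &to, &gtt_dirty);
	if (r)
		goto error;

	r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv,
				&next, gtt_dirty);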

Regards,
   Felix


> +		if (r)
> +			goto error;
> +
> +		dma_fence_put(fence);
> +		fence = next;
> +
> +		amdgpu_res_next(&dst, cur_size);
> +	}
> +error:
> +	mutex_unlock(&adev->mman.gtt_window_lock);
> +	if (f)
> +		*f = dma_fence_get(fence);
> +	dma_fence_put(fence);
> +	return r;
> +}
> +
>   /**
>    * amdgpu_ttm_evict_resources - evict memory buffers
>    * @adev: amdgpu device object
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> index d9691f262f16..bcd34592e45d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
> @@ -35,6 +35,8 @@
>   
>   #define AMDGPU_GTT_MAX_TRANSFER_SIZE	512
>   #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS	2
> +#define AMDGPU_GTT_MAX_TRANSFER_BYTES	(AMDGPU_GTT_MAX_TRANSFER_SIZE * \
> +					 AMDGPU_GPU_PAGE_SIZE)
>   
>   #define AMDGPU_POISON	0xd0bed0be
>   


* Re: [PATCH 2/2] drm/amdgpu: restructure amdgpu_fill_buffer
  2022-01-28 15:55   ` Felix Kuehling
@ 2022-01-31 13:39     ` Christian König
  0 siblings, 0 replies; 4+ messages in thread
From: Christian König @ 2022-01-31 13:39 UTC (permalink / raw)
  To: Felix Kuehling, harish.kasiviswanathan, benjaminadam.price, amd-gfx



On 2022-01-28 at 16:55, Felix Kuehling wrote:
>
> On 2022-01-28 at 10:16, Christian König wrote:
>> [SNIP]
>> -    if (bo->tbo.resource->mem_type == AMDGPU_PL_PREEMPT) {
>
> As far as I can see, you didn't add this check back elsewhere. The
> promise for preemptible BOs is that we never have to wait for the GPU
> to finish accessing them, because the context using them is
> preemptible. This was a necessary condition for disabling GTT usage
> accounting for such BOs. If you allow filling such BOs with this
> function, you're breaking that promise.

That's now part of amdgpu_ttm_map_buffer(), but unfortunately as a
BUG_ON(). I've added a patch to change that into a warning.
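
Roughly like this (just a sketch, not the exact follow-up patch):

	/* instead of crashing on a BUG_ON(), warn once and fail the
	 * mapping of preemptible memory gracefully */
	if (WARN_ONCE(mem->mem_type == AMDGPU_PL_PREEMPT,
		      "Trying to map preemptible memory\n"))
		return -EINVAL;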

[SNIP]
>> +        cur_size = min_t(u64, dst.size, AMDGPU_GTT_MAX_TRANSFER_BYTES);
>
> For VRAM, the cur_size could be much bigger because we're not limited
> by the GART transfer window. I'm pretty sure that 2MiB is going to add
> too much overhead. For the extreme 60GiB BO example, it would require
> about 30,000 IBs and IB frames (60GiB / 2MiB ≈ 30,720) to fill the
> entire buffer. That's a lot of VMID-switching, IB execution, fence
> signalling, interrupts, etc.

I've restructured the mapping function so that we can now copy/fill in 
256MiB chunks when no GART window is involved.
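
The idea is roughly the following (sketch; direct_vram_access stands in
for whatever condition the updated patch actually checks, and the 256MiB
limit is illustrative):

	/* contiguous VRAM can be written through its physical address,
	 * so no GART window and no 2MiB limit is needed in that case */
	if (direct_vram_access)
		cur_size = min_t(u64, dst.size, 256ULL << 20);
	else
		cur_size = min_t(u64, dst.size, AMDGPU_GTT_MAX_TRANSFER_BYTES);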

>
>
>> +
>> +        r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &dst,
>> +                      PFN_UP(cur_size), 1, ring, false,
>> +                      &to);
>> +        if (r)
>> +            goto error;
>> +
>> +        r = amdgpu_ttm_fill_mem(ring, src_data, to, cur_size, resv,
>> +                    &next, false);
>
> If amdgpu_ttm_map_buffer updated the page tables, shouldn't the 
> vm_needs_flush parameter be true? This flag should probably be 
> returned by amdgpu_ttm_map_buffer.

Ah, yes. That's indeed a typo. For now I haven't added the
vm_needs_flush parameter, but that's something we could optimize.

Regards,
Christian.

>
> Regards,
>   Felix
>
>
>> +        if (r)
>> +            goto error;
>> +
>> +        dma_fence_put(fence);
>> +        fence = next;
>> +
>> +        amdgpu_res_next(&dst, cur_size);
>> +    }
>> +error:
>> +    mutex_unlock(&adev->mman.gtt_window_lock);
>> +    if (f)
>> +        *f = dma_fence_get(fence);
>> +    dma_fence_put(fence);
>> +    return r;
>> +}
>> +
>>   /**
>>    * amdgpu_ttm_evict_resources - evict memory buffers
>>    * @adev: amdgpu device object
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h 
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> index d9691f262f16..bcd34592e45d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h
>> @@ -35,6 +35,8 @@
>>     #define AMDGPU_GTT_MAX_TRANSFER_SIZE    512
>>   #define AMDGPU_GTT_NUM_TRANSFER_WINDOWS    2
>> +#define AMDGPU_GTT_MAX_TRANSFER_BYTES (AMDGPU_GTT_MAX_TRANSFER_SIZE * \
>> +                     AMDGPU_GPU_PAGE_SIZE)
>>     #define AMDGPU_POISON    0xd0bed0be


