All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2)
@ 2017-09-19  6:41 Monk Liu
       [not found] ` <1505803274-8237-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Monk Liu @ 2017-09-19  6:41 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

otherwise a gpu hang will make application couldn't be killed
under timedout=0 mode

v2:
Fix memoryleak job/job->s_fence issue
unlock mn
remove the ERROR msg after waiting being interrupted

Change-Id: I6051b5b3ae1188983f49325a2438c84a6c12374a
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 17 +++++++++++++++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 12 +++++++-----
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cc9a232..6ff2959 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -736,8 +736,8 @@ struct amdgpu_ctx_mgr {
 struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
 int amdgpu_ctx_put(struct amdgpu_ctx *ctx);
 
-uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
-			      struct dma_fence *fence);
+int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
+			      struct dma_fence *fence, uint64_t *seq);
 struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
 				   struct amdgpu_ring *ring, uint64_t seq);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index b59749d..9bd4834 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1043,6 +1043,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity;
 	struct amdgpu_job *job;
 	unsigned i;
+	uint64_t seq;
+
 	int r;
 
 	amdgpu_mn_lock(p->mn);
@@ -1071,8 +1073,19 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
 	job->owner = p->filp;
 	job->fence_ctx = entity->fence_context;
 	p->fence = dma_fence_get(&job->base.s_fence->finished);
-	cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence);
-	job->uf_sequence = cs->out.handle;
+	r = amdgpu_ctx_add_fence(p->ctx, ring, p->fence, &seq);
+	if (r) {
+		/* release job include the sched fence as well */
+		dma_fence_put(&job->base.s_fence->finished);
+		dma_fence_put(&job->base.s_fence->scheduled);
+		amdgpu_job_free(job);
+		amdgpu_mn_unlock(p->mn);
+		dma_fence_put(p->fence);
+		return r;
+	}
+
+	cs->out.handle = seq;
+	job->uf_sequence = seq;
 	amdgpu_job_free_resources(job);
 
 	trace_amdgpu_cs_ioctl(job);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index a11e443..551f114 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -246,8 +246,8 @@ int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
 	return 0;
 }
 
-uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
-			      struct dma_fence *fence)
+int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
+			      struct dma_fence *fence, uint64_t* handler)
 {
 	struct amdgpu_ctx_ring *cring = & ctx->rings[ring->idx];
 	uint64_t seq = cring->sequence;
@@ -258,9 +258,9 @@ uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
 	other = cring->fences[idx];
 	if (other) {
 		signed long r;
-		r = dma_fence_wait_timeout(other, false, MAX_SCHEDULE_TIMEOUT);
+		r = dma_fence_wait_timeout(other, true, MAX_SCHEDULE_TIMEOUT);
 		if (r < 0)
-			DRM_ERROR("Error (%ld) waiting for fence!\n", r);
+			return -ERESTARTSYS;
 	}
 
 	dma_fence_get(fence);
@@ -271,8 +271,10 @@ uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
 	spin_unlock(&ctx->ring_lock);
 
 	dma_fence_put(other);
+	if (handler)
+		*handler = seq;
 
-	return seq;
+	return 0;
 }
 
 struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset
       [not found] ` <1505803274-8237-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-19  6:41   ` Monk Liu
       [not found]     ` <1505803274-8237-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-09-19  6:41   ` [PATCH 3/3] drm/amdgpu:fix typo Monk Liu
  2017-09-19 11:42   ` [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2) Christian König
  2 siblings, 1 reply; 8+ messages in thread
From: Monk Liu @ 2017-09-19  6:41 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

GPU reset will require all hw doing hw_init thus
ucode_init_bo will be invoked again, which lead to
memory leak

skip the fw_buf allocation during sriov gpu reset to avoid
memory leak.

Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 +++++++++++++++----------------
 2 files changed, 35 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 6ff2959..3d0c633 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
 
 	/* gpu info firmware data pointer */
 	const struct firmware *gpu_info_fw;
+
+	void *fw_buf_ptr;
+	uint64_t fw_buf_mc;
 };
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
index f306374..6564902 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
@@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct amdgpu_firmware_info *ucode,
 int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 {
 	struct amdgpu_bo **bo = &adev->firmware.fw_buf;
-	uint64_t fw_mc_addr;
-	void *fw_buf_ptr = NULL;
 	uint64_t fw_offset = 0;
 	int i, err;
 	struct amdgpu_firmware_info *ucode = NULL;
@@ -372,37 +370,39 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 		return 0;
 	}
 
-	err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
-				amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
-				AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
-				NULL, NULL, 0, bo);
-	if (err) {
-		dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
-		goto failed;
-	}
+	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
+		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
+					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
+					NULL, NULL, 0, bo);
+		if (err) {
+			dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
+			goto failed;
+		}
 
-	err = amdgpu_bo_reserve(*bo, false);
-	if (err) {
-		dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
-		goto failed_reserve;
-	}
+		err = amdgpu_bo_reserve(*bo, false);
+		if (err) {
+			dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
+			goto failed_reserve;
+		}
 
-	err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
-				&fw_mc_addr);
-	if (err) {
-		dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
-		goto failed_pin;
-	}
+		err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
+					&adev->firmware.fw_buf_mc);
+		if (err) {
+			dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
+			goto failed_pin;
+		}
 
-	err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);
-	if (err) {
-		dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
-		goto failed_kmap;
-	}
+		err = amdgpu_bo_kmap(*bo, &adev->firmware.fw_buf_ptr);
+		if (err) {
+			dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
+			goto failed_kmap;
+		}
 
-	amdgpu_bo_unreserve(*bo);
+		amdgpu_bo_unreserve(*bo);
+	}
 
-	memset(fw_buf_ptr, 0, adev->firmware.fw_size);
+	memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size);
 
 	/*
 	 * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
@@ -421,14 +421,14 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
 		ucode = &adev->firmware.ucode[i];
 		if (ucode->fw) {
 			header = (const struct common_firmware_header *)ucode->fw->data;
-			amdgpu_ucode_init_single_fw(adev, ucode, fw_mc_addr + fw_offset,
-						    (void *)((uint8_t *)fw_buf_ptr + fw_offset));
+			amdgpu_ucode_init_single_fw(adev, ucode, adev->firmware.fw_buf_mc + fw_offset,
+						    adev->firmware.fw_buf_ptr + fw_offset);
 			if (i == AMDGPU_UCODE_ID_CP_MEC1 &&
 			    adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
 				const struct gfx_firmware_header_v1_0 *cp_hdr;
 				cp_hdr = (const struct gfx_firmware_header_v1_0 *)ucode->fw->data;
-				amdgpu_ucode_patch_jt(ucode, fw_mc_addr + fw_offset,
-						    fw_buf_ptr + fw_offset);
+				amdgpu_ucode_patch_jt(ucode,  adev->firmware.fw_buf_mc + fw_offset,
+						    adev->firmware.fw_buf_ptr + fw_offset);
 				fw_offset += ALIGN(le32_to_cpu(cp_hdr->jt_size) << 2, PAGE_SIZE);
 			}
 			fw_offset += ALIGN(ucode->ucode_size, PAGE_SIZE);
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* [PATCH 3/3] drm/amdgpu:fix typo
       [not found] ` <1505803274-8237-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-09-19  6:41   ` [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset Monk Liu
@ 2017-09-19  6:41   ` Monk Liu
       [not found]     ` <1505803274-8237-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-09-19 11:42   ` [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2) Christian König
  2 siblings, 1 reply; 8+ messages in thread
From: Monk Liu @ 2017-09-19  6:41 UTC (permalink / raw)
  To: amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW; +Cc: Monk Liu

previously a patch has typo error, correct it

Change-Id: I91bcefd7148b5db1c7d957c868e13a46ca40ef74
Signed-off-by: Monk Liu <Monk.Liu@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 4eee2ef..35cc5ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -256,7 +256,7 @@ static int psp_hw_start(struct psp_context *psp)
 	struct amdgpu_device *adev = psp->adev;
 	int ret;
 
-	if (amdgpu_sriov_vf(adev) && !adev->in_sriov_reset) {
+	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
 		ret = psp_bootloader_load_sysdrv(psp);
 		if (ret)
 			return ret;
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2)
       [not found] ` <1505803274-8237-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-09-19  6:41   ` [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset Monk Liu
  2017-09-19  6:41   ` [PATCH 3/3] drm/amdgpu:fix typo Monk Liu
@ 2017-09-19 11:42   ` Christian König
  2 siblings, 0 replies; 8+ messages in thread
From: Christian König @ 2017-09-19 11:42 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 19.09.2017 um 08:41 schrieb Monk Liu:
> otherwise a gpu hang will make application couldn't be killed
> under timedout=0 mode
>
> v2:
> Fix memoryleak job/job->s_fence issue
> unlock mn
> remove the ERROR msg after waiting being interrupted
>
> Change-Id: I6051b5b3ae1188983f49325a2438c84a6c12374a
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h     |  4 ++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 17 +++++++++++++++--
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 12 +++++++-----
>   3 files changed, 24 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index cc9a232..6ff2959 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -736,8 +736,8 @@ struct amdgpu_ctx_mgr {
>   struct amdgpu_ctx *amdgpu_ctx_get(struct amdgpu_fpriv *fpriv, uint32_t id);
>   int amdgpu_ctx_put(struct amdgpu_ctx *ctx);
>   
> -uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
> -			      struct dma_fence *fence);
> +int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
> +			      struct dma_fence *fence, uint64_t *seq);
>   struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,
>   				   struct amdgpu_ring *ring, uint64_t seq);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index b59749d..9bd4834 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -1043,6 +1043,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	struct amd_sched_entity *entity = &p->ctx->rings[ring->idx].entity;
>   	struct amdgpu_job *job;
>   	unsigned i;
> +	uint64_t seq;
> +
>   	int r;
>   
>   	amdgpu_mn_lock(p->mn);
> @@ -1071,8 +1073,19 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
>   	job->owner = p->filp;
>   	job->fence_ctx = entity->fence_context;
>   	p->fence = dma_fence_get(&job->base.s_fence->finished);
> -	cs->out.handle = amdgpu_ctx_add_fence(p->ctx, ring, p->fence);
> -	job->uf_sequence = cs->out.handle;
> +	r = amdgpu_ctx_add_fence(p->ctx, ring, p->fence, &seq);
> +	if (r) {
> +		/* release job include the sched fence as well */
> +		dma_fence_put(&job->base.s_fence->finished);
> +		dma_fence_put(&job->base.s_fence->scheduled);
> +		amdgpu_job_free(job);
> +		amdgpu_mn_unlock(p->mn);
> +		dma_fence_put(p->fence);
> +		return r;
> +	}
> +
> +	cs->out.handle = seq;
> +	job->uf_sequence = seq;
>   	amdgpu_job_free_resources(job);
>   
>   	trace_amdgpu_cs_ioctl(job);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index a11e443..551f114 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -246,8 +246,8 @@ int amdgpu_ctx_put(struct amdgpu_ctx *ctx)
>   	return 0;
>   }
>   
> -uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
> -			      struct dma_fence *fence)
> +int amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
> +			      struct dma_fence *fence, uint64_t* handler)
>   {
>   	struct amdgpu_ctx_ring *cring = & ctx->rings[ring->idx];
>   	uint64_t seq = cring->sequence;
> @@ -258,9 +258,9 @@ uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
>   	other = cring->fences[idx];
>   	if (other) {
>   		signed long r;
> -		r = dma_fence_wait_timeout(other, false, MAX_SCHEDULE_TIMEOUT);
> +		r = dma_fence_wait_timeout(other, true, MAX_SCHEDULE_TIMEOUT);
>   		if (r < 0)
> -			DRM_ERROR("Error (%ld) waiting for fence!\n", r);
> +			return -ERESTARTSYS;

Return the original error code here, e.g. "r".

With that fixed the patch is Reviewed-by: Christian König 
<christian.koenig@amd.com>

Regards,
Christian.

>   	}
>   
>   	dma_fence_get(fence);
> @@ -271,8 +271,10 @@ uint64_t amdgpu_ctx_add_fence(struct amdgpu_ctx *ctx, struct amdgpu_ring *ring,
>   	spin_unlock(&ctx->ring_lock);
>   
>   	dma_fence_put(other);
> +	if (handler)
> +		*handler = seq;
>   
> -	return seq;
> +	return 0;
>   }
>   
>   struct dma_fence *amdgpu_ctx_get_fence(struct amdgpu_ctx *ctx,


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset
       [not found]     ` <1505803274-8237-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-19 11:43       ` Christian König
       [not found]         ` <1c39869f-5d12-437c-5b08-dba488b9c937-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
  0 siblings, 1 reply; 8+ messages in thread
From: Christian König @ 2017-09-19 11:43 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 19.09.2017 um 08:41 schrieb Monk Liu:
> GPU reset will require all hw doing hw_init thus
> ucode_init_bo will be invoked again, which lead to
> memory leak
>
> skip the fw_buf allocation during sriov gpu reset to avoid
> memory leak.
>
> Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Christian König <christian.koenig@amd.com> for now.

But we should really clean this up and do the allocation during SW init.

Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 +++++++++++++++----------------
>   2 files changed, 35 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6ff2959..3d0c633 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
>   
>   	/* gpu info firmware data pointer */
>   	const struct firmware *gpu_info_fw;
> +
> +	void *fw_buf_ptr;
> +	uint64_t fw_buf_mc;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index f306374..6564902 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct amdgpu_firmware_info *ucode,
>   int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_bo **bo = &adev->firmware.fw_buf;
> -	uint64_t fw_mc_addr;
> -	void *fw_buf_ptr = NULL;
>   	uint64_t fw_offset = 0;
>   	int i, err;
>   	struct amdgpu_firmware_info *ucode = NULL;
> @@ -372,37 +370,39 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		return 0;
>   	}
>   
> -	err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
> -				amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> -				AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> -				NULL, NULL, 0, bo);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
> -		goto failed;
> -	}
> +	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
> +		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
> +					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> +					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> +					NULL, NULL, 0, bo);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
> +			goto failed;
> +		}
>   
> -	err = amdgpu_bo_reserve(*bo, false);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
> -		goto failed_reserve;
> -	}
> +		err = amdgpu_bo_reserve(*bo, false);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
> +			goto failed_reserve;
> +		}
>   
> -	err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> -				&fw_mc_addr);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
> -		goto failed_pin;
> -	}
> +		err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> +					&adev->firmware.fw_buf_mc);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
> +			goto failed_pin;
> +		}
>   
> -	err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
> -		goto failed_kmap;
> -	}
> +		err = amdgpu_bo_kmap(*bo, &adev->firmware.fw_buf_ptr);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
> +			goto failed_kmap;
> +		}
>   
> -	amdgpu_bo_unreserve(*bo);
> +		amdgpu_bo_unreserve(*bo);
> +	}
>   
> -	memset(fw_buf_ptr, 0, adev->firmware.fw_size);
> +	memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size);
>   
>   	/*
>   	 * if SMU loaded firmware, it needn't add SMC, UVD, and VCE
> @@ -421,14 +421,14 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		ucode = &adev->firmware.ucode[i];
>   		if (ucode->fw) {
>   			header = (const struct common_firmware_header *)ucode->fw->data;
> -			amdgpu_ucode_init_single_fw(adev, ucode, fw_mc_addr + fw_offset,
> -						    (void *)((uint8_t *)fw_buf_ptr + fw_offset));
> +			amdgpu_ucode_init_single_fw(adev, ucode, adev->firmware.fw_buf_mc + fw_offset,
> +						    adev->firmware.fw_buf_ptr + fw_offset);
>   			if (i == AMDGPU_UCODE_ID_CP_MEC1 &&
>   			    adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
>   				const struct gfx_firmware_header_v1_0 *cp_hdr;
>   				cp_hdr = (const struct gfx_firmware_header_v1_0 *)ucode->fw->data;
> -				amdgpu_ucode_patch_jt(ucode, fw_mc_addr + fw_offset,
> -						    fw_buf_ptr + fw_offset);
> +				amdgpu_ucode_patch_jt(ucode,  adev->firmware.fw_buf_mc + fw_offset,
> +						    adev->firmware.fw_buf_ptr + fw_offset);
>   				fw_offset += ALIGN(le32_to_cpu(cp_hdr->jt_size) << 2, PAGE_SIZE);
>   			}
>   			fw_offset += ALIGN(ucode->ucode_size, PAGE_SIZE);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu:fix typo
       [not found]     ` <1505803274-8237-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
@ 2017-09-19 11:44       ` Christian König
  2017-09-19 14:25       ` Alex Deucher
  1 sibling, 0 replies; 8+ messages in thread
From: Christian König @ 2017-09-19 11:44 UTC (permalink / raw)
  To: Monk Liu, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

Am 19.09.2017 um 08:41 schrieb Monk Liu:
> previously a patch has typo error, correct it
>
> Change-Id: I91bcefd7148b5db1c7d957c868e13a46ca40ef74
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Christian König <christian.koenig@amd.com>

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 4eee2ef..35cc5ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -256,7 +256,7 @@ static int psp_hw_start(struct psp_context *psp)
>   	struct amdgpu_device *adev = psp->adev;
>   	int ret;
>   
> -	if (amdgpu_sriov_vf(adev) && !adev->in_sriov_reset) {
> +	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
>   		ret = psp_bootloader_load_sysdrv(psp);
>   		if (ret)
>   			return ret;


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 3/3] drm/amdgpu:fix typo
       [not found]     ` <1505803274-8237-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
  2017-09-19 11:44       ` Christian König
@ 2017-09-19 14:25       ` Alex Deucher
  1 sibling, 0 replies; 8+ messages in thread
From: Alex Deucher @ 2017-09-19 14:25 UTC (permalink / raw)
  To: Monk Liu; +Cc: amd-gfx list

On Tue, Sep 19, 2017 at 2:41 AM, Monk Liu <Monk.Liu@amd.com> wrote:
> previously a patch has typo error, correct it

Please squash this into the previous broken patch before committing it.

Alex

>
> Change-Id: I91bcefd7148b5db1c7d957c868e13a46ca40ef74
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 4eee2ef..35cc5ee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -256,7 +256,7 @@ static int psp_hw_start(struct psp_context *psp)
>         struct amdgpu_device *adev = psp->adev;
>         int ret;
>
> -       if (amdgpu_sriov_vf(adev) && !adev->in_sriov_reset) {
> +       if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
>                 ret = psp_bootloader_load_sysdrv(psp);
>                 if (ret)
>                         return ret;
> --
> 2.7.4
>
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

* RE: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset
       [not found]         ` <1c39869f-5d12-437c-5b08-dba488b9c937-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
@ 2017-09-20  6:17           ` Liu, Monk
  0 siblings, 0 replies; 8+ messages in thread
From: Liu, Monk @ 2017-09-20  6:17 UTC (permalink / raw)
  To: Koenig, Christian, amd-gfx-PD4FTy7X32lNgt0PjOBp9y5qC8QIuHrW

No ,we cannot do this in SW init, because PSP need all other component finish their HW init and get the fw_size, before it can call ucode_init, so if we put this in SW init, the fw_size is still 0....


BR Monk

-----Original Message-----
From: Christian König [mailto:ckoenig.leichtzumerken@gmail.com] 
Sent: 2017年9月19日 19:44
To: Liu, Monk <Monk.Liu@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset

Am 19.09.2017 um 08:41 schrieb Monk Liu:
> GPU reset will require all hw doing hw_init thus ucode_init_bo will be 
> invoked again, which lead to memory leak
>
> skip the fw_buf allocation during sriov gpu reset to avoid memory 
> leak.
>
> Change-Id: I31131eda1bd45ea2f5bdc50c5da5fc5a9fe9027d
> Signed-off-by: Monk Liu <Monk.Liu@amd.com>

Acked-by: Christian König <christian.koenig@amd.com> for now.

But we should really clean this up and do the allocation during SW init.

Christian.

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h       |  3 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c | 64 +++++++++++++++----------------
>   2 files changed, 35 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 6ff2959..3d0c633 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1185,6 +1185,9 @@ struct amdgpu_firmware {
>   
>   	/* gpu info firmware data pointer */
>   	const struct firmware *gpu_info_fw;
> +
> +	void *fw_buf_ptr;
> +	uint64_t fw_buf_mc;
>   };
>   
>   /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> index f306374..6564902 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c
> @@ -360,8 +360,6 @@ static int amdgpu_ucode_patch_jt(struct amdgpu_firmware_info *ucode,
>   int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   {
>   	struct amdgpu_bo **bo = &adev->firmware.fw_buf;
> -	uint64_t fw_mc_addr;
> -	void *fw_buf_ptr = NULL;
>   	uint64_t fw_offset = 0;
>   	int i, err;
>   	struct amdgpu_firmware_info *ucode = NULL; @@ -372,37 +370,39 @@ 
> int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		return 0;
>   	}
>   
> -	err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
> -				amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> -				AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> -				NULL, NULL, 0, bo);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
> -		goto failed;
> -	}
> +	if (!amdgpu_sriov_vf(adev) || !adev->in_sriov_reset) {
> +		err = amdgpu_bo_create(adev, adev->firmware.fw_size, PAGE_SIZE, true,
> +					amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> +					AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS,
> +					NULL, NULL, 0, bo);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer allocate failed\n", err);
> +			goto failed;
> +		}
>   
> -	err = amdgpu_bo_reserve(*bo, false);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
> -		goto failed_reserve;
> -	}
> +		err = amdgpu_bo_reserve(*bo, false);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer reserve failed\n", err);
> +			goto failed_reserve;
> +		}
>   
> -	err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> -				&fw_mc_addr);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
> -		goto failed_pin;
> -	}
> +		err = amdgpu_bo_pin(*bo, amdgpu_sriov_vf(adev) ? AMDGPU_GEM_DOMAIN_VRAM : AMDGPU_GEM_DOMAIN_GTT,
> +					&adev->firmware.fw_buf_mc);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer pin failed\n", err);
> +			goto failed_pin;
> +		}
>   
> -	err = amdgpu_bo_kmap(*bo, &fw_buf_ptr);
> -	if (err) {
> -		dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
> -		goto failed_kmap;
> -	}
> +		err = amdgpu_bo_kmap(*bo, &adev->firmware.fw_buf_ptr);
> +		if (err) {
> +			dev_err(adev->dev, "(%d) Firmware buffer kmap failed\n", err);
> +			goto failed_kmap;
> +		}
>   
> -	amdgpu_bo_unreserve(*bo);
> +		amdgpu_bo_unreserve(*bo);
> +	}
>   
> -	memset(fw_buf_ptr, 0, adev->firmware.fw_size);
> +	memset(adev->firmware.fw_buf_ptr, 0, adev->firmware.fw_size);
>   
>   	/*
>   	 * if SMU loaded firmware, it needn't add SMC, UVD, and VCE @@ 
> -421,14 +421,14 @@ int amdgpu_ucode_init_bo(struct amdgpu_device *adev)
>   		ucode = &adev->firmware.ucode[i];
>   		if (ucode->fw) {
>   			header = (const struct common_firmware_header *)ucode->fw->data;
> -			amdgpu_ucode_init_single_fw(adev, ucode, fw_mc_addr + fw_offset,
> -						    (void *)((uint8_t *)fw_buf_ptr + fw_offset));
> +			amdgpu_ucode_init_single_fw(adev, ucode, adev->firmware.fw_buf_mc + fw_offset,
> +						    adev->firmware.fw_buf_ptr + fw_offset);
>   			if (i == AMDGPU_UCODE_ID_CP_MEC1 &&
>   			    adev->firmware.load_type != AMDGPU_FW_LOAD_PSP) {
>   				const struct gfx_firmware_header_v1_0 *cp_hdr;
>   				cp_hdr = (const struct gfx_firmware_header_v1_0 *)ucode->fw->data;
> -				amdgpu_ucode_patch_jt(ucode, fw_mc_addr + fw_offset,
> -						    fw_buf_ptr + fw_offset);
> +				amdgpu_ucode_patch_jt(ucode,  adev->firmware.fw_buf_mc + fw_offset,
> +						    adev->firmware.fw_buf_ptr + fw_offset);
>   				fw_offset += ALIGN(le32_to_cpu(cp_hdr->jt_size) << 2, PAGE_SIZE);
>   			}
>   			fw_offset += ALIGN(ucode->ucode_size, PAGE_SIZE);


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-09-20  6:17 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-09-19  6:41 [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2) Monk Liu
     [not found] ` <1505803274-8237-1-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-09-19  6:41   ` [PATCH 2/3] drm/amdgpu/sriov:fix memory leak after gpu reset Monk Liu
     [not found]     ` <1505803274-8237-2-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-09-19 11:43       ` Christian König
     [not found]         ` <1c39869f-5d12-437c-5b08-dba488b9c937-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org>
2017-09-20  6:17           ` Liu, Monk
2017-09-19  6:41   ` [PATCH 3/3] drm/amdgpu:fix typo Monk Liu
     [not found]     ` <1505803274-8237-3-git-send-email-Monk.Liu-5C7GfCeVMHo@public.gmane.org>
2017-09-19 11:44       ` Christian König
2017-09-19 14:25       ` Alex Deucher
2017-09-19 11:42   ` [PATCH 1/3] drm/amdgpu:make ctx_add_fence interruptible(v2) Christian König

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.