All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Revert "drm/amdgpu: remove ctx->lock"
       [not found] <62d06aef-ff23-93a3-dc36-c4840b1f6d80@amd.com>
@ 2022-06-21 14:42 ` Luben Tuikov
  2022-06-27 21:30   ` Alex Deucher
  2022-07-12  3:22   ` Tales Lelo da Aparecida
  0 siblings, 2 replies; 4+ messages in thread
From: Luben Tuikov @ 2022-06-21 14:42 UTC (permalink / raw)
  To: amd-gfx
  Cc: Alex Deucher, Andrey Grodzovsky, Luben Tuikov,
	Christian König, Vitaly Prosyak

This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.

We see that the bo validate list gets corrupted, in
amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
the next line, references a NULL bo and we get a koops.

Bisecting leads to the commit being reverted as the cause of the list
corruption. See the link below for details of the corruption failure.

Cc: Christian König <christian.koenig@amd.com>
Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
Cc: Alex Deucher <Alexander.Deucher@amd.com>
Cc: Vitaly Prosyak <Vitaly.Prosyak@amd.com>
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539
Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
 3 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 36ac1f1d11e6b4..e619031753b927 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
 		goto free_chunk;
 	}
 
+	mutex_lock(&p->ctx->lock);
+
 	/* skip guilty context job */
 	if (atomic_read(&p->ctx->guilty) == 1) {
 		ret = -ECANCELED;
@@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
 		}
 	}
 
-	/* Move fence waiting after getting reservation lock of
-	 * PD root. Then there is no need on a ctx mutex lock.
-	 */
-	r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
-	if (unlikely(r != 0)) {
-		if (r != -ERESTARTSYS)
-			DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
-		goto error_validate;
-	}
-
 	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
 					  &p->bytes_moved_vis_threshold);
 	p->bytes_moved = 0;
@@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
 	dma_fence_put(parser->fence);
 
 	if (parser->ctx) {
+		mutex_unlock(&parser->ctx->lock);
 		amdgpu_ctx_put(parser->ctx);
 	}
 	if (parser->bo_list)
@@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
 	if (parser->job->uf_addr && ring->funcs->no_user_fence)
 		return -EINVAL;
 
-	return 0;
+	return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
 }
 
 static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
@@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
 		goto out;
 
 	r = amdgpu_cs_submit(&parser, cs);
+
 out:
 	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 53f9268366f29e..2ef5296216d64d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
 	kref_init(&ctx->refcount);
 	ctx->mgr = mgr;
 	spin_lock_init(&ctx->ring_lock);
+	mutex_init(&ctx->lock);
 
 	ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
 	ctx->reset_counter_query = ctx->reset_counter;
@@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
 		drm_dev_exit(idx);
 	}
 
+	mutex_destroy(&ctx->lock);
 	kfree(ctx);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
index 0fa0e56daf67e9..cc7c8afff4144c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
@@ -53,6 +53,7 @@ struct amdgpu_ctx {
 	bool				preamble_presented;
 	int32_t				init_priority;
 	int32_t				override_priority;
+	struct mutex			lock;
 	atomic_t			guilty;
 	unsigned long			ras_counter_ce;
 	unsigned long			ras_counter_ue;

base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
-- 
2.36.1.74.g277cf0bc36


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
  2022-06-21 14:42 ` [PATCH] Revert "drm/amdgpu: remove ctx->lock" Luben Tuikov
@ 2022-06-27 21:30   ` Alex Deucher
  2022-07-12  3:22   ` Tales Lelo da Aparecida
  1 sibling, 0 replies; 4+ messages in thread
From: Alex Deucher @ 2022-06-27 21:30 UTC (permalink / raw)
  To: Luben Tuikov
  Cc: Alex Deucher, Andrey Grodzovsky, Christian König,
	amd-gfx list, Vitaly Prosyak

On Tue, Jun 21, 2022 at 10:42 AM Luben Tuikov <luben.tuikov@amd.com> wrote:
>
> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
>
> We see that the bo validate list gets corrupted, in
> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
> the next line, references a NULL bo and we get a koops.
>
> Bisecting leads to the commit being reverted as the cause of the list
> corruption. See the link below for details of the corruption failure.
>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> Cc: Vitaly Prosyak <Vitaly.Prosyak@amd.com>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539

Looks like this bug is also relevant:

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=216143

> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>  3 files changed, 8 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 36ac1f1d11e6b4..e619031753b927 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>                 goto free_chunk;
>         }
>
> +       mutex_lock(&p->ctx->lock);
> +
>         /* skip guilty context job */
>         if (atomic_read(&p->ctx->guilty) == 1) {
>                 ret = -ECANCELED;
> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>                 }
>         }
>
> -       /* Move fence waiting after getting reservation lock of
> -        * PD root. Then there is no need on a ctx mutex lock.
> -        */
> -       r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -       if (unlikely(r != 0)) {
> -               if (r != -ERESTARTSYS)
> -                       DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -               goto error_validate;
> -       }
> -
>         amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>                                           &p->bytes_moved_vis_threshold);
>         p->bytes_moved = 0;
> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>         dma_fence_put(parser->fence);
>
>         if (parser->ctx) {
> +               mutex_unlock(&parser->ctx->lock);
>                 amdgpu_ctx_put(parser->ctx);
>         }
>         if (parser->bo_list)
> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>         if (parser->job->uf_addr && ring->funcs->no_user_fence)
>                 return -EINVAL;
>
> -       return 0;
> +       return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>  }
>
>  static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>                 goto out;
>
>         r = amdgpu_cs_submit(&parser, cs);
> +
>  out:
>         amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 53f9268366f29e..2ef5296216d64d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
>         kref_init(&ctx->refcount);
>         ctx->mgr = mgr;
>         spin_lock_init(&ctx->ring_lock);
> +       mutex_init(&ctx->lock);
>
>         ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
>         ctx->reset_counter_query = ctx->reset_counter;
> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
>                 drm_dev_exit(idx);
>         }
>
> +       mutex_destroy(&ctx->lock);
>         kfree(ctx);
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67e9..cc7c8afff4144c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
>         bool                            preamble_presented;
>         int32_t                         init_priority;
>         int32_t                         override_priority;
> +       struct mutex                    lock;
>         atomic_t                        guilty;
>         unsigned long                   ras_counter_ce;
>         unsigned long                   ras_counter_ue;
>
> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
> --
> 2.36.1.74.g277cf0bc36
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
  2022-06-21 14:42 ` [PATCH] Revert "drm/amdgpu: remove ctx->lock" Luben Tuikov
  2022-06-27 21:30   ` Alex Deucher
@ 2022-07-12  3:22   ` Tales Lelo da Aparecida
  2022-07-12  3:29     ` Luben Tuikov
  1 sibling, 1 reply; 4+ messages in thread
From: Tales Lelo da Aparecida @ 2022-07-12  3:22 UTC (permalink / raw)
  To: Luben Tuikov
  Cc: Andrey Grodzovsky, siqueirajordao, magalilemes00,
	Maíra Canal, amd-gfx, Melissa Wen, Vitaly Prosyak,
	Alex Deucher, Isabella Basso, André Almeida,
	Christian König, Trevor Woerner

On 21/06/2022 11:42, Luben Tuikov wrote:
> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
> 
> We see that the bo validate list gets corrupted, in
> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
> the next line, references a NULL bo and we get a koops.
> 
> Bisecting leads to the commit being reverted as the cause of the list
> corruption. See the link below for details of the corruption failure.
> 
> Cc: Christian König <christian.koenig@amd.com>
> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
> Cc: Alex Deucher <Alexander.Deucher@amd.com>
> Cc: Vitaly Prosyak <Vitaly.Prosyak@amd.com>
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2048#note_1427539
> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>   3 files changed, 8 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> index 36ac1f1d11e6b4..e619031753b927 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>   		goto free_chunk;
>   	}
>   
> +	mutex_lock(&p->ctx->lock);
> +
>   	/* skip guilty context job */
>   	if (atomic_read(&p->ctx->guilty) == 1) {
>   		ret = -ECANCELED;
> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>   		}
>   	}
>   
> -	/* Move fence waiting after getting reservation lock of
> -	 * PD root. Then there is no need on a ctx mutex lock.
> -	 */
> -	r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
> -	if (unlikely(r != 0)) {
> -		if (r != -ERESTARTSYS)
> -			DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
> -		goto error_validate;
> -	}
> -
>   	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>   					  &p->bytes_moved_vis_threshold);
>   	p->bytes_moved = 0;
> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>   	dma_fence_put(parser->fence);
>   
>   	if (parser->ctx) {
> +		mutex_unlock(&parser->ctx->lock);
>   		amdgpu_ctx_put(parser->ctx);
>   	}
>   	if (parser->bo_list)
> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>   	if (parser->job->uf_addr && ring->funcs->no_user_fence)
>   		return -EINVAL;
>   
> -	return 0;
> +	return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>   }
>   
>   static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>   		goto out;
>   
>   	r = amdgpu_cs_submit(&parser, cs);
> +
>   out:
>   	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> index 53f9268366f29e..2ef5296216d64d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
>   	kref_init(&ctx->refcount);
>   	ctx->mgr = mgr;
>   	spin_lock_init(&ctx->ring_lock);
> +	mutex_init(&ctx->lock);
>   
>   	ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
>   	ctx->reset_counter_query = ctx->reset_counter;
> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
>   		drm_dev_exit(idx);
>   	}
>   
> +	mutex_destroy(&ctx->lock);
>   	kfree(ctx);
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> index 0fa0e56daf67e9..cc7c8afff4144c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
>   	bool				preamble_presented;
>   	int32_t				init_priority;
>   	int32_t				override_priority;
> +	struct mutex			lock;
>   	atomic_t			guilty;
>   	unsigned long			ras_counter_ce;
>   	unsigned long			ras_counter_ue;
> 
> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2

Hello,

Applied on amd-staging-drm-next (with two additional commits from 
torvalds/master to allow me to compile using GCC12: 52a9dab6, 82880283) 
and tested with IGT using a RX580*; the errors on the "amd_cs_nop" tests 
are gone!

* Advanced Micro Devices, Inc. [AMD/ATI] Ellesmere [Radeon RX 
470/480/570/570X/580/580X/590] (rev e7)

The remaining IGT tests matching ".*amdgpu.*" were not affected. (FYI, 
amdgpu/amd_plane, amdgpu/amd_bypass, amdgpu/amd_plane, 
amdgpu/amd_vrr_range are failing in here, some should skip instead, but 
that's another thread to come)

Tested-by: Tales Aparecida <tales.aparecida@gmail.com>

Thanks for your time,
Tales

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] Revert "drm/amdgpu: remove ctx->lock"
  2022-07-12  3:22   ` Tales Lelo da Aparecida
@ 2022-07-12  3:29     ` Luben Tuikov
  0 siblings, 0 replies; 4+ messages in thread
From: Luben Tuikov @ 2022-07-12  3:29 UTC (permalink / raw)
  To: 20220621144227.664800-1-luben.tuikov
  Cc: Andrey Grodzovsky, siqueirajordao, magalilemes00,
	Maíra Canal, amd-gfx, Melissa Wen, Vitaly Prosyak,
	Alex Deucher, Isabella Basso, André Almeida,
	Christian König, Trevor Woerner

On 2022-07-11 23:22, Tales Lelo da Aparecida wrote:
> On 21/06/2022 11:42, Luben Tuikov wrote:
>> This reverts commit e68efb27647f2106d6b545667f35b2ea39746b57.
>>
>> We see that the bo validate list gets corrupted, in
>> amdgpu_cs_list_validate(), the lobj->tv.bo is NULL. Then getting usermm on
>> the next line, references a NULL bo and we get a koops.
>>
>> Bisecting leads to the commit being reverted as the cause of the list
>> corruption. See the link below for details of the corruption failure.
>>
>> Cc: Christian König <christian.koenig@amd.com>
>> Cc: Andrey Grodzovsky <Andrey.Grodzovsky@amd.com>
>> Cc: Alex Deucher <Alexander.Deucher@amd.com>
>> Cc: Vitaly Prosyak <Vitaly.Prosyak@amd.com>
>> Link: https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F2048%23note_1427539&amp;data=05%7C01%7Cluben.tuikov%40amd.com%7C32a7f3eaffff490fc69f08da63b5c925%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637931929684937036%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=eXMfeyqaPhBIl9iJa4kbQtN1r4OhFZSGkZiGRJ3MQsQ%3D&amp;reserved=0
>> Signed-off-by: Luben Tuikov <luben.tuikov@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c  | 16 +++++-----------
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c |  2 ++
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h |  1 +
>>   3 files changed, 8 insertions(+), 11 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> index 36ac1f1d11e6b4..e619031753b927 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
>> @@ -128,6 +128,8 @@ static int amdgpu_cs_parser_init(struct amdgpu_cs_parser *p, union drm_amdgpu_cs
>>   		goto free_chunk;
>>   	}
>>   
>> +	mutex_lock(&p->ctx->lock);
>> +
>>   	/* skip guilty context job */
>>   	if (atomic_read(&p->ctx->guilty) == 1) {
>>   		ret = -ECANCELED;
>> @@ -585,16 +587,6 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p,
>>   		}
>>   	}
>>   
>> -	/* Move fence waiting after getting reservation lock of
>> -	 * PD root. Then there is no need on a ctx mutex lock.
>> -	 */
>> -	r = amdgpu_ctx_wait_prev_fence(p->ctx, p->entity);
>> -	if (unlikely(r != 0)) {
>> -		if (r != -ERESTARTSYS)
>> -			DRM_ERROR("amdgpu_ctx_wait_prev_fence failed.\n");
>> -		goto error_validate;
>> -	}
>> -
>>   	amdgpu_cs_get_threshold_for_moves(p->adev, &p->bytes_moved_threshold,
>>   					  &p->bytes_moved_vis_threshold);
>>   	p->bytes_moved = 0;
>> @@ -722,6 +714,7 @@ static void amdgpu_cs_parser_fini(struct amdgpu_cs_parser *parser, int error,
>>   	dma_fence_put(parser->fence);
>>   
>>   	if (parser->ctx) {
>> +		mutex_unlock(&parser->ctx->lock);
>>   		amdgpu_ctx_put(parser->ctx);
>>   	}
>>   	if (parser->bo_list)
>> @@ -965,7 +958,7 @@ static int amdgpu_cs_ib_fill(struct amdgpu_device *adev,
>>   	if (parser->job->uf_addr && ring->funcs->no_user_fence)
>>   		return -EINVAL;
>>   
>> -	return 0;
>> +	return amdgpu_ctx_wait_prev_fence(parser->ctx, parser->entity);
>>   }
>>   
>>   static int amdgpu_cs_process_fence_dep(struct amdgpu_cs_parser *p,
>> @@ -1384,6 +1377,7 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
>>   		goto out;
>>   
>>   	r = amdgpu_cs_submit(&parser, cs);
>> +
>>   out:
>>   	amdgpu_cs_parser_fini(&parser, r, reserved_buffers);
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> index 53f9268366f29e..2ef5296216d64d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
>> @@ -286,6 +286,7 @@ static int amdgpu_ctx_init(struct amdgpu_ctx_mgr *mgr, int32_t priority,
>>   	kref_init(&ctx->refcount);
>>   	ctx->mgr = mgr;
>>   	spin_lock_init(&ctx->ring_lock);
>> +	mutex_init(&ctx->lock);
>>   
>>   	ctx->reset_counter = atomic_read(&mgr->adev->gpu_reset_counter);
>>   	ctx->reset_counter_query = ctx->reset_counter;
>> @@ -400,6 +401,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
>>   		drm_dev_exit(idx);
>>   	}
>>   
>> +	mutex_destroy(&ctx->lock);
>>   	kfree(ctx);
>>   }
>>   
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> index 0fa0e56daf67e9..cc7c8afff4144c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h
>> @@ -53,6 +53,7 @@ struct amdgpu_ctx {
>>   	bool				preamble_presented;
>>   	int32_t				init_priority;
>>   	int32_t				override_priority;
>> +	struct mutex			lock;
>>   	atomic_t			guilty;
>>   	unsigned long			ras_counter_ce;
>>   	unsigned long			ras_counter_ue;
>>
>> base-commit: f4b3c543a2a759ce657de4b6b7e88eaddee85ec2
> 
> Hello,
> 
> Applied on amd-staging-drm-next (with two additional commits from 
> torvalds/master to allow me to compile using GCC12: 52a9dab6, 82880283) 
> and tested with IGT using a RX580*; the errors on the "amd_cs_nop" tests 
> are gone!
> 
> * Advanced Micro Devices, Inc. [AMD/ATI] Ellesmere [Radeon RX 
> 470/480/570/570X/580/580X/590] (rev e7)
> 
> The remaining IGT tests matching ".*amdgpu.*" were not affected. (FYI, 
> amdgpu/amd_plane, amdgpu/amd_bypass, amdgpu/amd_plane, 
> amdgpu/amd_vrr_range are failing in here, some should skip instead, but 
> that's another thread to come)
> 
> Tested-by: Tales Aparecida <tales.aparecida@gmail.com>
> 
> Thanks for your time,
> Tales

Hi Tales,

Thank you for testing this patch and letting us know of the results.
This helps a lot.


Best Regards,
-- 
Luben

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2022-07-12  3:29 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <62d06aef-ff23-93a3-dc36-c4840b1f6d80@amd.com>
2022-06-21 14:42 ` [PATCH] Revert "drm/amdgpu: remove ctx->lock" Luben Tuikov
2022-06-27 21:30   ` Alex Deucher
2022-07-12  3:22   ` Tales Lelo da Aparecida
2022-07-12  3:29     ` Luben Tuikov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.