* [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
@ 2021-06-16  8:35 xinhui pan
  2021-06-16  8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
  2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
  0 siblings, 2 replies; 6+ messages in thread
From: xinhui pan @ 2021-06-16  8:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig

Some resources are freed even when destroying a queue fails. That will
cause a double free when user space issues another destroy_queue ioctl.
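
The failing sequence, reconstructed from the call traces below
(paraphrased flow, not verbatim code):

  /* 1st destroy ioctl: the unmap fails, but SW state is freed anyway */
  kfd_ioctl_destroy_queue()
    pqm_destroy_queue()
      kfd_procfs_del_queue(q)    /* kobject_put() on the queue kobject */
      destroy_queue_cpsch()      /* fails, yet free_mqd ran on the MQD */

  /* 2nd destroy ioctl on the same qid: both frees run a second time */
  kfd_ioctl_destroy_queue()
    pqm_destroy_queue()
      kfd_procfs_del_queue(q)    /* refcount underflow */
      destroy_queue_cpsch()      /* double kfree of the MQD */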

Some log output is pasted below.

amdgpu: Can't create new usermode queue because -1 queues were already
created

refcount_t: underflow; use-after-free.
Call Trace:
 kobject_put+0xe6/0x1b0
 kfd_procfs_del_queue+0x37/0x50 [amdgpu]
 pqm_destroy_queue+0x17a/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
 kfd_ioctl+0x463/0x690 [amdgpu]

BUG kmalloc-32 (Tainted: G        W        ): Object already free
INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
pid=2511
 __slab_alloc+0x72/0x80
 kmem_cache_alloc_trace+0x81f/0x8c0
 allocate_sdma_mqd+0x30/0xb0 [amdgpu]
 create_queue_cpsch+0xbf/0x470 [amdgpu]
 pqm_create_queue+0x28d/0x6d0 [amdgpu]
 kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
pid=2511
 kfree+0x322/0x340
 free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
 destroy_queue_cpsch+0x20c/0x330 [amdgpu]
 pqm_destroy_queue+0x1a3/0x390 [amdgpu]
 kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 2 ++
 drivers/gpu/drm/amd/amdkfd/kfd_process.c               | 4 +++-
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e6366b408420..c24ab8f17eb6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
 				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
 		if (retval == -ETIME)
 			qpd->reset_wavefronts = true;
+		else if (retval)
+			goto failed_try_destroy_debugged_queue;
 		if (q->properties.is_gws) {
 			dqm->gws_queue_count--;
 			qpd->mapped_gws_queue = false;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 09b98a83f670..984197e5929f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
 
 void kfd_procfs_del_queue(struct queue *q)
 {
-	if (!q)
+	if (!q || !kobject_get_unless_zero(&q->kobj))
 		return;
 
 	kobject_del(&q->kobj);
 	kobject_put(&q->kobj);
+	/* paired with the get above */
+	kobject_put(&q->kobj);
 }
 
 int kfd_process_create_wq(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..4fcb64bc43dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
 		dqm = pqn->kq->dev->dqm;
 		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
 		kernel_queue_uninit(pqn->kq, false);
+		pqn->kq = NULL;
 	}
 
 	if (pqn->q) {
-- 
2.25.1

* [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
  2021-06-16  8:35 [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
@ 2021-06-16  8:35 ` xinhui pan
  2021-06-16 23:09   ` Felix Kuehling
  2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
  1 sibling, 1 reply; 6+ messages in thread
From: xinhui pan @ 2021-06-16  8:35 UTC (permalink / raw)
  To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig

Walk the queues list with the dqm lock held to avoid list corruption.
The lock is dropped only around free_mqd(), which must stay outside it
to avoid circular locking.

Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c24ab8f17eb6..1f84de861ec6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
 {
 	int retval;
-	struct queue *q, *next;
+	struct queue *q;
 	struct kernel_queue *kq, *kq_next;
 	struct mqd_manager *mqd_mgr;
 	struct device_process_node *cur, *next_dpn;
@@ -1739,8 +1739,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 				qpd->mapped_gws_queue = false;
 			}
 		}
-
-		dqm->total_queue_count--;
 	}
 
 	/* Unregister process */
@@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
 	/* Lastly, free mqd resources.
 	 * Do free_mqd() after dqm_unlock to avoid circular locking.
 	 */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+	dqm_lock(dqm);
+	while (!list_empty(&qpd->queues_list)) {
+		q = list_first_entry(&qpd->queues_list, struct queue, list);
 		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
 				q->properties.type)];
 		list_del(&q->list);
 		qpd->queue_count--;
+		dqm->total_queue_count--;
+		dqm_unlock(dqm);
 		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+		dqm_lock(dqm);
 	}
+	dqm_unlock(dqm);
 
 	return retval;
 }
-- 
2.25.1

* Re: [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
  2021-06-16  8:35 [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
  2021-06-16  8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
@ 2021-06-16 22:55 ` Felix Kuehling
  2021-06-17  0:48   ` Pan, Xinhui
  1 sibling, 1 reply; 6+ messages in thread
From: Felix Kuehling @ 2021-06-16 22:55 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig

On 2021-06-16 4:35 a.m., xinhui pan wrote:
> Some resources are freed even when destroying a queue fails.

Looks like you're keeping this behaviour for -ETIME. That is consistent 
with what pqm_destroy_queue does. What you're fixing here is the
behaviour for non-timeout errors. Please make that clear in the patch 
description.
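
The consistency referred to here, roughly (paraphrased from
pqm_destroy_queue; details may differ from the actual source):

	retval = dqm->ops.destroy_queue(dqm, &pdd->qpd, pqn->q);
	if (retval) {
		/* -ETIME: the HW never acknowledged the preemption, but SW
		 * teardown still proceeds.  Any other error aborts here. */
		if (retval != -ETIME)
			goto err_destroy_queue;
	}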

Out of curiosity, what kind of error were you getting? The only other
ones that are not a fatal memory shortage are some EINVAL cases in
pm_unmap_queues_v*. But that would indicate some internal error: that a
queue was created with an invalid type, or maybe that the queue data
structure was somehow corrupted.


>   That will cause a double
> free when user space issues another destroy_queue ioctl.
>
> Some log output is pasted below.
>
> amdgpu: Can't create new usermode queue because -1 queues were already
> created
>
> refcount_t: underflow; use-after-free.
> Call Trace:
>   kobject_put+0xe6/0x1b0
>   kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>   pqm_destroy_queue+0x17a/0x390 [amdgpu]
>   kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>   kfd_ioctl+0x463/0x690 [amdgpu]
>
> BUG kmalloc-32 (Tainted: G        W        ): Object already free
> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
> pid=2511
>   __slab_alloc+0x72/0x80
>   kmem_cache_alloc_trace+0x81f/0x8c0
>   allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>   create_queue_cpsch+0xbf/0x470 [amdgpu]
>   pqm_create_queue+0x28d/0x6d0 [amdgpu]
>   kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
> pid=2511
>   kfree+0x322/0x340
>   free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>   destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>   pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>   kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 2 ++
>   drivers/gpu/drm/amd/amdkfd/kfd_process.c               | 4 +++-
>   drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
>   3 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e6366b408420..c24ab8f17eb6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>   				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>   		if (retval == -ETIME)
>   			qpd->reset_wavefronts = true;
> +		else if (retval)
> +			goto failed_try_destroy_debugged_queue;
>   		if (q->properties.is_gws) {
>   			dqm->gws_queue_count--;
>   			qpd->mapped_gws_queue = false;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 09b98a83f670..984197e5929f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>   
>   void kfd_procfs_del_queue(struct queue *q)
>   {
> -	if (!q)
> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>   		return;
>   
>   	kobject_del(&q->kobj);
>   	kobject_put(&q->kobj);
> +	/* paired with the get above */
> +	kobject_put(&q->kobj);
>   }
>   
>   int kfd_process_create_wq(void)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 95a6c36cea4c..4fcb64bc43dd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>   		dqm = pqn->kq->dev->dqm;
>   		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>   		kernel_queue_uninit(pqn->kq, false);
> +		pqn->kq = NULL;

This seems unrelated to this patch. But if you're fixing this, I'd 
expect a similar fix after uninit_queue(pqn->q).
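
Something like this, presumably (a sketch of the suggested analogous
fix; not taken from any posted patch):

	uninit_queue(pqn->q);
	pqn->q = NULL;	/* don't leave a stale pointer behind */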

Regards,
   Felix


>   	}
>   
>   	if (pqn->q) {

* Re: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
  2021-06-16  8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
@ 2021-06-16 23:09   ` Felix Kuehling
  2021-06-17  3:05     ` Chen, Jiansong (Simon)
  0 siblings, 1 reply; 6+ messages in thread
From: Felix Kuehling @ 2021-06-16 23:09 UTC (permalink / raw)
  To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig

On 2021-06-16 4:35 a.m., xinhui pan wrote:
> Walk the queues list with the dqm lock held to avoid list corruption.
> The lock is dropped only around free_mqd(), which must stay outside it
> to avoid circular locking.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c24ab8f17eb6..1f84de861ec6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   		struct qcm_process_device *qpd)
>   {
>   	int retval;
> -	struct queue *q, *next;
> +	struct queue *q;
>   	struct kernel_queue *kq, *kq_next;
>   	struct mqd_manager *mqd_mgr;
>   	struct device_process_node *cur, *next_dpn;
> @@ -1739,8 +1739,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   				qpd->mapped_gws_queue = false;
>   			}
>   		}
> -
> -		dqm->total_queue_count--;

I think this should stay here. This is only used to check the maximum 
user queue limit per-device, which is a HW limitation. As far as the HW 
is concerned, the queues are destroyed after the call to 
execute_queues_cpsch. So there is no need to delay this update.
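
For reference, the create-time gate that relies on this counter looks
roughly like this (paraphrased from memory of create_queue_cpsch, not
verbatim; it is also what prints the "-1 queues were already created"
message from patch 1 once the counter underflows):

	if (dqm->total_queue_count >= max_num_of_queues_per_device) {
		pr_warn("Can't create new usermode queue because %d queues were already created\n",
				dqm->total_queue_count);
		retval = -EPERM;
		goto out;
	}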


>   	}
>   
>   	/* Unregister process */
> @@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>   	/* Lastly, free mqd resources.
>   	 * Do free_mqd() after dqm_unlock to avoid circular locking.
>   	 */
> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> +	dqm_lock(dqm);

Instead of taking the dqm lock again, just move this up a couple of 
lines before the dqm_unlock call.
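
That is (a sketch of the suggestion; it assumes the dqm_unlock a few
lines above is simply dropped rather than followed by a re-lock):

	/* still under the dqm lock taken earlier in the function */
	while (!list_empty(&qpd->queues_list)) {
		q = list_first_entry(&qpd->queues_list, struct queue, list);
		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
				q->properties.type)];
		list_del(&q->list);
		qpd->queue_count--;
		dqm_unlock(dqm);
		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
		dqm_lock(dqm);
	}
	dqm_unlock(dqm);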

Regards,
   Felix


> +	while (!list_empty(&qpd->queues_list)) {
> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
>   		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>   				q->properties.type)];
>   		list_del(&q->list);
>   		qpd->queue_count--;
> +		dqm->total_queue_count--;
> +		dqm_unlock(dqm);
>   		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +		dqm_lock(dqm);
>   	}
> +	dqm_unlock(dqm);
>   
>   	return retval;
>   }

* Re: [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
  2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
@ 2021-06-17  0:48   ` Pan, Xinhui
  0 siblings, 0 replies; 6+ messages in thread
From: Pan, Xinhui @ 2021-06-17  0:48 UTC (permalink / raw)
  To: Kuehling, Felix
  Cc: Deucher, Alexander, Pan, Xinhui, Koenig, Christian, amd-gfx


> On 2021-06-17 06:55, Kuehling, Felix <Felix.Kuehling@amd.com> wrote:
> 
> On 2021-06-16 4:35 a.m., xinhui pan wrote:
>> Some resources are freed even when destroying a queue fails.
> 
> Looks like you're keeping this behaviour for -ETIME. That is consistent with what pqm_destroy_queue does. What you're fixing here is the behaviour for non-timeout errors. Please make that clear in the patch description.
Will do that in v2.

> 
> Out of curiosity, what kind of error were you getting? The only other ones that are not a fatal memory shortage are some EINVAL cases in pm_unmap_queues_v*. But that would indicate some internal error: that a queue was created with an invalid type, or maybe that the queue data structure was somehow corrupted.
> 
This is just because amdkfd_fence_wait_timeout timed out, so
execute_queues_cpsch returns -EIO as dqm->is_hws_hang is true.
I hit this issue with kfdtest --gtest_filter=*QM*.
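
The path, paraphrased (not verbatim source):

	unmap_queues_cpsch()
		amdkfd_fence_wait_timeout()	/* times out */
		dqm->is_hws_hang = true;

	execute_queues_cpsch()	/* sees is_hws_hang, bails out with -EIO */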
> 
>>  That will cause a double
>> free when user space issues another destroy_queue ioctl.
>> 
>> Some log output is pasted below.
>> 
>> amdgpu: Can't create new usermode queue because -1 queues were already
>> created
>> 
>> refcount_t: underflow; use-after-free.
>> Call Trace:
>>  kobject_put+0xe6/0x1b0
>>  kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>>  pqm_destroy_queue+0x17a/0x390 [amdgpu]
>>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>  kfd_ioctl+0x463/0x690 [amdgpu]
>> 
>> BUG kmalloc-32 (Tainted: G        W        ): Object already free
>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>> pid=2511
>>  __slab_alloc+0x72/0x80
>>  kmem_cache_alloc_trace+0x81f/0x8c0
>>  allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>>  create_queue_cpsch+0xbf/0x470 [amdgpu]
>>  pqm_create_queue+0x28d/0x6d0 [amdgpu]
>>  kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>> pid=2511
>>  kfree+0x322/0x340
>>  free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>>  destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>>  pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>>  kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>> 
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c  | 2 ++
>>  drivers/gpu/drm/amd/amdkfd/kfd_process.c               | 4 +++-
>>  drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
>>  3 files changed, 6 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index e6366b408420..c24ab8f17eb6 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>>  				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>>  		if (retval == -ETIME)
>>  			qpd->reset_wavefronts = true;
>> +		else if (retval)
>> +			goto failed_try_destroy_debugged_queue;
>>  		if (q->properties.is_gws) {
>>  			dqm->gws_queue_count--;
>>  			qpd->mapped_gws_queue = false;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 09b98a83f670..984197e5929f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>>    void kfd_procfs_del_queue(struct queue *q)
>>  {
>> -	if (!q)
>> +	if (!q || !kobject_get_unless_zero(&q->kobj))
>>  		return;
>>    	kobject_del(&q->kobj);
>>  	kobject_put(&q->kobj);
>> +	/* paired with the get above */
>> +	kobject_put(&q->kobj);
>>  }
>>    int kfd_process_create_wq(void)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 95a6c36cea4c..4fcb64bc43dd 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>>  		dqm = pqn->kq->dev->dqm;
>>  		dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>>  		kernel_queue_uninit(pqn->kq, false);
>> +		pqn->kq = NULL;
> 
> This seems unrelated to this patch. But if you're fixing this, I'd expect a similar fix after uninit_queue(pqn->q).
> 
> Regards,
>   Felix
> 
> 
>>  	}
>>    	if (pqn->q) {



* RE: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
  2021-06-16 23:09   ` Felix Kuehling
@ 2021-06-17  3:05     ` Chen, Jiansong (Simon)
  0 siblings, 0 replies; 6+ messages in thread
From: Chen, Jiansong (Simon) @ 2021-06-17  3:05 UTC (permalink / raw)
  To: Kuehling, Felix, Pan, Xinhui, amd-gfx
  Cc: Deucher, Alexander, Koenig, Christian

BTW, there is an obvious typo in the subject, Walk thorugh => Walk through.

Regards,
Jiansong
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Thursday, June 17, 2021 7:09 AM
To: Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold

On 2021-06-16 4:35 a.m., xinhui pan wrote:
> Walk the queues list with the dqm lock held to avoid list corruption.
> The lock is dropped only around free_mqd(), which must stay outside it
> to avoid circular locking.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
>   .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c    | 12 ++++++++----
>   1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c24ab8f17eb6..1f84de861ec6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>               struct qcm_process_device *qpd)
>   {
>       int retval;
> -     struct queue *q, *next;
> +     struct queue *q;
>       struct kernel_queue *kq, *kq_next;
>       struct mqd_manager *mqd_mgr;
>       struct device_process_node *cur, *next_dpn; @@ -1739,8 +1739,6 @@
> static int process_termination_cpsch(struct device_queue_manager *dqm,
>                               qpd->mapped_gws_queue = false;
>                       }
>               }
> -
> -             dqm->total_queue_count--;

I think this should stay here. This is only used to check the maximum user queue limit per-device, which is a HW limitation. As far as the HW is concerned, the queues are destroyed after the call to execute_queues_cpsch. So there is no need to delay this update.


>       }
>
>       /* Unregister process */
> @@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
>       /* Lastly, free mqd resources.
>        * Do free_mqd() after dqm_unlock to avoid circular locking.
>        */
> -     list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> +     dqm_lock(dqm);

Instead of taking the dqm lock again, just move this up a couple of lines before the dqm_unlock call.

Regards,
   Felix


> +     while (!list_empty(&qpd->queues_list)) {
> +             q = list_first_entry(&qpd->queues_list, struct queue, list);
>               mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>                               q->properties.type)];
>               list_del(&q->list);
>               qpd->queue_count--;
> +             dqm->total_queue_count--;
> +             dqm_unlock(dqm);
>               mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +             dqm_lock(dqm);
>       }
> +     dqm_unlock(dqm);
>
>       return retval;
>   }
