* [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
@ 2021-06-16 8:35 xinhui pan
2021-06-16 8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
0 siblings, 2 replies; 6+ messages in thread
From: xinhui pan @ 2021-06-16 8:35 UTC (permalink / raw)
To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig
Some resources are freed even when destroy queue fails. That will cause a
double free when user-space issues another destroy_queue ioctl.
Paste some log below.
amdgpu: Can't create new usermode queue because -1 queues were already
created
refcount_t: underflow; use-after-free.
Call Trace:
kobject_put+0xe6/0x1b0
kfd_procfs_del_queue+0x37/0x50 [amdgpu]
pqm_destroy_queue+0x17a/0x390 [amdgpu]
kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
kfd_ioctl+0x463/0x690 [amdgpu]
BUG kmalloc-32 (Tainted: G W ): Object already free
INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
pid=2511
__slab_alloc+0x72/0x80
kmem_cache_alloc_trace+0x81f/0x8c0
allocate_sdma_mqd+0x30/0xb0 [amdgpu]
create_queue_cpsch+0xbf/0x470 [amdgpu]
pqm_create_queue+0x28d/0x6d0 [amdgpu]
kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
pid=2511
kfree+0x322/0x340
free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
destroy_queue_cpsch+0x20c/0x330 [amdgpu]
pqm_destroy_queue+0x1a3/0x390 [amdgpu]
kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +++-
drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
3 files changed, 6 insertions(+), 1 deletion(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index e6366b408420..c24ab8f17eb6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
+ else if (retval)
+ goto failed_try_destroy_debugged_queue;
if (q->properties.is_gws) {
dqm->gws_queue_count--;
qpd->mapped_gws_queue = false;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 09b98a83f670..984197e5929f 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
void kfd_procfs_del_queue(struct queue *q)
{
- if (!q)
+ if (!q || !kobject_get_unless_zero(&q->kobj))
return;
kobject_del(&q->kobj);
kobject_put(&q->kobj);
+ /* paired with the get above */
+ kobject_put(&q->kobj);
}
int kfd_process_create_wq(void)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 95a6c36cea4c..4fcb64bc43dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
dqm = pqn->kq->dev->dqm;
dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
kernel_queue_uninit(pqn->kq, false);
+ pqn->kq = NULL;
}
if (pqn->q) {
--
2.25.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread
* [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
2021-06-16 8:35 [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
@ 2021-06-16 8:35 ` xinhui pan
2021-06-16 23:09 ` Felix Kuehling
2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
1 sibling, 1 reply; 6+ messages in thread
From: xinhui pan @ 2021-06-16 8:35 UTC (permalink / raw)
To: amd-gfx; +Cc: alexander.deucher, Felix.Kuehling, xinhui pan, christian.koenig
Walk through the queue list with the dqm lock held to avoid any list corruption.
Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
.../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index c24ab8f17eb6..1f84de861ec6 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
int retval;
- struct queue *q, *next;
+ struct queue *q;
struct kernel_queue *kq, *kq_next;
struct mqd_manager *mqd_mgr;
struct device_process_node *cur, *next_dpn;
@@ -1739,8 +1739,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
qpd->mapped_gws_queue = false;
}
}
-
- dqm->total_queue_count--;
}
/* Unregister process */
@@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
/* Lastly, free mqd resources.
* Do free_mqd() after dqm_unlock to avoid circular locking.
*/
- list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+ dqm_lock(dqm);
+ while (!list_empty(&qpd->queues_list)) {
+ q = list_first_entry(&qpd->queues_list, struct queue, list);
mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
q->properties.type)];
list_del(&q->list);
qpd->queue_count--;
+ dqm->total_queue_count--;
+ dqm_unlock(dqm);
mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+ dqm_lock(dqm);
}
+ dqm_unlock(dqm);
return retval;
}
--
2.25.1
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
2021-06-16 8:35 [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
2021-06-16 8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
@ 2021-06-16 22:55 ` Felix Kuehling
2021-06-17 0:48 ` Pan, Xinhui
1 sibling, 1 reply; 6+ messages in thread
From: Felix Kuehling @ 2021-06-16 22:55 UTC (permalink / raw)
To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig
On 2021-06-16 4:35 a.m., xinhui pan wrote:
> Some resource are freed even destroy queue fails.
Looks like you're keeping this behaviour for -ETIME. That is consistent
with what pqn_destroy_queue does. What you're fixing here is the
behaviour for non-timeout errors. Please make that clear in the patch
description.
Out of curiosity, what kind of error were you getting? The only other
ones that are not a fatal memory shortage, are some EINVAL cases in
pm_unmap_queues_v*. But that would indicate some internal error, that a
queue was created with an invalid type, or maybe the queue data
structure was somehow corrupted.
> That will cause double
> free when user-space issue another destroy_queue ioctl.
>
> Paste some log below.
>
> amdgpu: Can't create new usermode queue because -1 queues were already
> created
>
> refcount_t: underflow; use-after-free.
> Call Trace:
> kobject_put+0xe6/0x1b0
> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
> pqm_destroy_queue+0x17a/0x390 [amdgpu]
> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
> kfd_ioctl+0x463/0x690 [amdgpu]
>
> BUG kmalloc-32 (Tainted: G W ): Object already free
> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
> pid=2511
> __slab_alloc+0x72/0x80
> kmem_cache_alloc_trace+0x81f/0x8c0
> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
> create_queue_cpsch+0xbf/0x470 [amdgpu]
> pqm_create_queue+0x28d/0x6d0 [amdgpu]
> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
> pid=2511
> kfree+0x322/0x340
> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +++-
> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
> 3 files changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index e6366b408420..c24ab8f17eb6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
> if (retval == -ETIME)
> qpd->reset_wavefronts = true;
> + else if (retval)
> + goto failed_try_destroy_debugged_queue;
> if (q->properties.is_gws) {
> dqm->gws_queue_count--;
> qpd->mapped_gws_queue = false;
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> index 09b98a83f670..984197e5929f 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>
> void kfd_procfs_del_queue(struct queue *q)
> {
> - if (!q)
> + if (!q || !kobject_get_unless_zero(&q->kobj))
> return;
>
> kobject_del(&q->kobj);
> kobject_put(&q->kobj);
> + /* paired with the get above */
> + kobject_put(&q->kobj);
> }
>
> int kfd_process_create_wq(void)
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> index 95a6c36cea4c..4fcb64bc43dd 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
> dqm = pqn->kq->dev->dqm;
> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
> kernel_queue_uninit(pqn->kq, false);
> + pqn->kq = NULL;
This seems unrelated to this patch. But if you're fixing this, I'd
expect a similar fix after uninit_queue(pqn->q).
Regards,
Felix
> }
>
> if (pqn->q) {
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
2021-06-16 8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
@ 2021-06-16 23:09 ` Felix Kuehling
2021-06-17 3:05 ` Chen, Jiansong (Simon)
0 siblings, 1 reply; 6+ messages in thread
From: Felix Kuehling @ 2021-06-16 23:09 UTC (permalink / raw)
To: xinhui pan, amd-gfx; +Cc: alexander.deucher, christian.koenig
On 2021-06-16 4:35 a.m., xinhui pan wrote:
> To avoid any list corruption.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c24ab8f17eb6..1f84de861ec6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd)
> {
> int retval;
> - struct queue *q, *next;
> + struct queue *q;
> struct kernel_queue *kq, *kq_next;
> struct mqd_manager *mqd_mgr;
> struct device_process_node *cur, *next_dpn;
> @@ -1739,8 +1739,6 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> qpd->mapped_gws_queue = false;
> }
> }
> -
> - dqm->total_queue_count--;
I think this should stay here. This is only used to check the maximum
user queue limit per-device, which is a HW limitation. As far as the HW
is concerned, the queues are destroyed after the call to
execute_queues_cpsch. So there is no need to delay this update.
> }
>
> /* Unregister process */
> @@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> /* Lastly, free mqd resources.
> * Do free_mqd() after dqm_unlock to avoid circular locking.
> */
> - list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> + dqm_lock(dqm);
Instead of taking the dqm lock again, just move this up a couple of
lines before the dqm_unlock call.
Regards,
Felix
> + while (!list_empty(&qpd->queues_list)) {
> + q = list_first_entry(&qpd->queues_list, struct queue, list);
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> list_del(&q->list);
> qpd->queue_count--;
> + dqm->total_queue_count--;
> + dqm_unlock(dqm);
> mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> + dqm_lock(dqm);
> }
> + dqm_unlock(dqm);
>
> return retval;
> }
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails
2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
@ 2021-06-17 0:48 ` Pan, Xinhui
0 siblings, 0 replies; 6+ messages in thread
From: Pan, Xinhui @ 2021-06-17 0:48 UTC (permalink / raw)
To: Kuehling, Felix
Cc: Deucher, Alexander, Pan, Xinhui, Koenig, Christian, amd-gfx
[-- Attachment #1: Type: text/plain, Size: 4513 bytes --]
> 2021年6月17日 06:55,Kuehling, Felix <Felix.Kuehling@amd.com> 写道:
>
> On 2021-06-16 4:35 a.m., xinhui pan wrote:
>> Some resource are freed even destroy queue fails.
>
> Looks like you're keeping this behaviour for -ETIME. That is consistent with what pqn_destroy_queue does. What you're fixing here is the behaviour for non-timeout errors. Please make that clear in the patch description.
will do that in v2.
>
> Out of curiosity, what kind of error were you getting? The only other ones that are not a fatal memory shortage, are some EINVAL cases in pm_unmap_queues_v*. But that would indicate some internal error, that a queue was created with an invalid type, or maybe the queue data structure was somehow corrupted.
>
This is just because amdkfd_fence_wait_timeout timed out. execute_queues_cpsch returns -EIO as dqm->is_hws_hang is true.
Hit this issue with kfdtest --gtest_filter=*QM*
>
>> That will cause double
>> free when user-space issue another destroy_queue ioctl.
>>
>> Paste some log below.
>>
>> amdgpu: Can't create new usermode queue because -1 queues were already
>> created
>>
>> refcount_t: underflow; use-after-free.
>> Call Trace:
>> kobject_put+0xe6/0x1b0
>> kfd_procfs_del_queue+0x37/0x50 [amdgpu]
>> pqm_destroy_queue+0x17a/0x390 [amdgpu]
>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>> kfd_ioctl+0x463/0x690 [amdgpu]
>>
>> BUG kmalloc-32 (Tainted: G W ): Object already free
>> INFO: Allocated in allocate_sdma_mqd+0x30/0xb0 [amdgpu] age=4796 cpu=2
>> pid=2511
>> __slab_alloc+0x72/0x80
>> kmem_cache_alloc_trace+0x81f/0x8c0
>> allocate_sdma_mqd+0x30/0xb0 [amdgpu]
>> create_queue_cpsch+0xbf/0x470 [amdgpu]
>> pqm_create_queue+0x28d/0x6d0 [amdgpu]
>> kfd_ioctl_create_queue+0x492/0xae0 [amdgpu]
>> INFO: Freed in free_mqd_hiq_sdma+0x20/0x60 [amdgpu] age=2537 cpu=7
>> pid=2511
>> kfree+0x322/0x340
>> free_mqd_hiq_sdma+0x20/0x60 [amdgpu]
>> destroy_queue_cpsch+0x20c/0x330 [amdgpu]
>> pqm_destroy_queue+0x1a3/0x390 [amdgpu]
>> kfd_ioctl_destroy_queue+0x57/0xc0 [amdgpu]
>>
>> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
>> ---
>> drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 2 ++
>> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 4 +++-
>> drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 1 +
>> 3 files changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index e6366b408420..c24ab8f17eb6 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -1529,6 +1529,8 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
>> KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
>> if (retval == -ETIME)
>> qpd->reset_wavefronts = true;
>> + else if (retval)
>> + goto failed_try_destroy_debugged_queue;
>> if (q->properties.is_gws) {
>> dqm->gws_queue_count--;
>> qpd->mapped_gws_queue = false;
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 09b98a83f670..984197e5929f 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -607,11 +607,13 @@ static int kfd_procfs_add_sysfs_files(struct kfd_process *p)
>> void kfd_procfs_del_queue(struct queue *q)
>> {
>> - if (!q)
>> + if (!q || !kobject_get_unless_zero(&q->kobj))
>> return;
>> kobject_del(&q->kobj);
>> kobject_put(&q->kobj);
>> + /* paired with the get above */
>> + kobject_put(&q->kobj);
>> }
>> int kfd_process_create_wq(void)
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> index 95a6c36cea4c..4fcb64bc43dd 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
>> @@ -373,6 +373,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid)
>> dqm = pqn->kq->dev->dqm;
>> dqm->ops.destroy_kernel_queue(dqm, pqn->kq, &pdd->qpd);
>> kernel_queue_uninit(pqn->kq, false);
>> + pqn->kq = NULL;
>
> This seems unrelated to this patch. But if you're fixing this, I'd expect a similar fix after uninit_queue(pqn->q).
>
> Regards,
> Felix
>
>
>> }
>> if (pqn->q) {
[-- Attachment #2: winmail.dat --]
[-- Type: application/ms-tnef, Size: 15002 bytes --]
[-- Attachment #3: Type: text/plain, Size: 154 bytes --]
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
* RE: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
2021-06-16 23:09 ` Felix Kuehling
@ 2021-06-17 3:05 ` Chen, Jiansong (Simon)
0 siblings, 0 replies; 6+ messages in thread
From: Chen, Jiansong (Simon) @ 2021-06-17 3:05 UTC (permalink / raw)
To: Kuehling, Felix, Pan, Xinhui, amd-gfx
Cc: Deucher, Alexander, Koenig, Christian
[AMD Official Use Only]
BTW, there is an obvious typo in the subject, Walk thorugh => Walk through.
Regards,
Jiansong
-----Original Message-----
From: amd-gfx <amd-gfx-bounces@lists.freedesktop.org> On Behalf Of Felix Kuehling
Sent: Thursday, June 17, 2021 7:09 AM
To: Pan, Xinhui <Xinhui.Pan@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander <Alexander.Deucher@amd.com>; Koenig, Christian <Christian.Koenig@amd.com>
Subject: Re: [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold
On 2021-06-16 4:35 a.m., xinhui pan wrote:
> To avoid any list corruption.
>
> Signed-off-by: xinhui pan <xinhui.pan@amd.com>
> ---
> .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 ++++++++----
> 1 file changed, 8 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index c24ab8f17eb6..1f84de861ec6 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -1704,7 +1704,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> struct qcm_process_device *qpd)
> {
> int retval;
> - struct queue *q, *next;
> + struct queue *q;
> struct kernel_queue *kq, *kq_next;
> struct mqd_manager *mqd_mgr;
> struct device_process_node *cur, *next_dpn; @@ -1739,8 +1739,6 @@
> static int process_termination_cpsch(struct device_queue_manager *dqm,
> qpd->mapped_gws_queue = false;
> }
> }
> -
> - dqm->total_queue_count--;
I think this should stay here. This is only used to check the maximum user queue limit per-device, which is a HW limitation. As far as the HW is concerned, the queues are destroyed after the call to execute_queues_cpsch. So there is no need to delay this update.
> }
>
> /* Unregister process */
> @@ -1772,13 +1770,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
> /* Lastly, free mqd resources.
> * Do free_mqd() after dqm_unlock to avoid circular locking.
> */
> - list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> + dqm_lock(dqm);
Instead of taking the dqm lock again, just move this up a couple of lines before the dqm_unlock call.
Regards,
Felix
> + while (!list_empty(&qpd->queues_list)) {
> + q = list_first_entry(&qpd->queues_list, struct queue, list);
> mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> q->properties.type)];
> list_del(&q->list);
> qpd->queue_count--;
> + dqm->total_queue_count--;
> + dqm_unlock(dqm);
> mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> + dqm_lock(dqm);
> }
> + dqm_unlock(dqm);
>
> return retval;
> }
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfx&data=04%7C01%7CJiansong.Chen%40amd.com%7C47251235ca70449c924608d9311bc4ce%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637594817623083340%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000&sdata=h%2FzO6DHA%2F6%2Btw0iJp4aBFfw8KZPVgtmgkfj3VQho4pM%3D&reserved=0
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2021-06-17 3:05 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-06-16 8:35 [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails xinhui pan
2021-06-16 8:35 ` [PATCH 2/2] drm/amdkfd: Walk thorugh list with dqm lock hold xinhui pan
2021-06-16 23:09 ` Felix Kuehling
2021-06-17 3:05 ` Chen, Jiansong (Simon)
2021-06-16 22:55 ` [PATCH 1/2] drm/amdkfd: Fix some double free when destroy queue fails Felix Kuehling
2021-06-17 0:48 ` Pan, Xinhui
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.