* [PATCH] drm/amdkfd: Fix circular lock in nocpsch path
@ 2021-06-15 17:50 Amber Lin
  2021-06-15 18:22 ` Felix Kuehling
From: Amber Lin @ 2021-06-15 17:50 UTC
  To: amd-gfx; +Cc: Amber Lin

Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
circular lock dependency. destroy_queue_nocpsch_locked is called while
holding the DQM lock, which is also taken in MMU notifiers, potentially
in FS reclaim context. free_mqd takes another lock, the BO reservation
lock, and can trigger FS reclaim; doing so while holding the DQM lock
creates a circular lock dependency. Therefore move free_mqd out of
destroy_queue_nocpsch_locked and call it after unlocking the DQM lock.
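
For reference, the inverted dependency looks roughly like this (a
minimal sketch, not code from this patch; the comments describe the
chain a lockdep splat would report):

    /* Path A: queue destruction, before this patch */
    dqm_lock(dqm);
    mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
            /* takes the BO reservation lock and frees memory,
             * i.e. can participate in FS reclaim */
    dqm_unlock(dqm);

    /* Path B: memory pressure */
    /* FS reclaim -> MMU notifier -> dqm_lock(dqm) */

    /*
     * A: dqm_lock -> reservation lock -> FS reclaim
     * B: FS reclaim -> dqm_lock
     * => circular lock dependency
     */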

Signed-off-by: Amber Lin <Amber.Lin@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
---
 .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 72bea5278add..c069fa259b30 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
 	if (retval == -ETIME)
 		qpd->reset_wavefronts = true;
 
-
-	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
-
 	list_del(&q->list);
 	if (list_empty(&qpd->queues_list)) {
 		if (qpd->reset_wavefronts) {
@@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 	int retval;
 	uint64_t sdma_val = 0;
 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
+	struct mqd_manager *mqd_mgr =
+		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
 
 	/* Get the SDMA queue stats */
 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
@@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
 		pdd->sdma_past_activity_counter += sdma_val;
 	dqm_unlock(dqm);
 
+	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+
 	return retval;
 }
 
@@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
 static int process_termination_nocpsch(struct device_queue_manager *dqm,
 		struct qcm_process_device *qpd)
 {
-	struct queue *q, *next;
+	struct queue *q;
 	struct device_process_node *cur, *next_dpn;
 	int retval = 0;
 	bool found = false;
@@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
 	dqm_lock(dqm);
 
 	/* Clear all user mode queues */
-	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
+	while (!list_empty(&qpd->queues_list)) {
+		struct mqd_manager *mqd_mgr;
 		int ret;
 
+		q = list_first_entry(&qpd->queues_list, struct queue, list);
+		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
+				q->properties.type)];
 		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
 		if (ret)
 			retval = ret;
+		dqm_unlock(dqm);
+		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
+		dqm_lock(dqm);
 	}
 
 	/* Unregister process */
-- 
2.17.1


* Re: [PATCH] drm/amdkfd: Fix circular lock in nocpsch path
  2021-06-15 17:50 [PATCH] drm/amdkfd: Fix circular lock in nocpsch path Amber Lin
@ 2021-06-15 18:22 ` Felix Kuehling
  2021-06-16  4:01   ` Pan, Xinhui
From: Felix Kuehling @ 2021-06-15 18:22 UTC
  To: Amber Lin, amd-gfx, Pan, Xinhui

[+Xinhui]


On 2021-06-15 at 1:50 p.m., Amber Lin wrote:
> Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
> circular lock dependency. destroy_queue_nocpsch_locked is called while
> holding the DQM lock, which is also taken in MMU notifiers, potentially
> in FS reclaim context. free_mqd takes another lock, the BO reservation
> lock, and can trigger FS reclaim; doing so while holding the DQM lock
> creates a circular lock dependency. Therefore move free_mqd out of
> destroy_queue_nocpsch_locked and call it after unlocking the DQM lock.
>
> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>

Let's submit this patch as is. I'm making some comments inline for
things that Xinhui can address in his race condition patch.


> ---
>  .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +++++++++++++-----
>  1 file changed, 13 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index 72bea5278add..c069fa259b30 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>  	if (retval == -ETIME)
>  		qpd->reset_wavefronts = true;
>  
> -
> -	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> -
>  	list_del(&q->list);
>  	if (list_empty(&qpd->queues_list)) {
>  		if (qpd->reset_wavefronts) {
> @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>  	int retval;
>  	uint64_t sdma_val = 0;
>  	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
> +	struct mqd_manager *mqd_mgr =
> +		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
>  
>  	/* Get the SDMA queue stats */
>  	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
> @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>  		pdd->sdma_past_activity_counter += sdma_val;
>  	dqm_unlock(dqm);
>  
> +	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +
>  	return retval;
>  }
>  
> @@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
>  static int process_termination_nocpsch(struct device_queue_manager *dqm,
>  		struct qcm_process_device *qpd)
>  {
> -	struct queue *q, *next;
> +	struct queue *q;
>  	struct device_process_node *cur, *next_dpn;
>  	int retval = 0;
>  	bool found = false;
> @@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
>  	dqm_lock(dqm);
>  
>  	/* Clear all user mode queues */
> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
> +	while (!list_empty(&qpd->queues_list)) {
> +		struct mqd_manager *mqd_mgr;
>  		int ret;
>  
> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
> +		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
> +				q->properties.type)];
>  		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
>  		if (ret)
>  			retval = ret;
> +		dqm_unlock(dqm);
> +		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
> +		dqm_lock(dqm);

This is the correct way to clean up the list when dropping the dqm-lock
in the middle. Xinhui, you can use the same method in
process_termination_cpsch.
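
To spell out why the _safe iterator is not enough once the lock is
dropped: list_for_each_entry_safe caches the next pointer before the
loop body runs, so another thread can delete and free that cached
entry while we sleep in free_mqd. Re-reading the list head after
re-acquiring the lock avoids the stale pointer. A sketch of the
pattern (the per-queue mqd_mgr lookup is elided):

    dqm_lock(dqm);
    while (!list_empty(&qpd->queues_list)) {
            q = list_first_entry(&qpd->queues_list, struct queue, list);
            destroy_queue_nocpsch_locked(dqm, qpd, q); /* list_del under lock */
            dqm_unlock(dqm);
            mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); /* may reclaim */
            dqm_lock(dqm); /* next iteration re-reads the list head */
    }
    dqm_unlock(dqm);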

I believe the swapping of the q->mqd with a temporary variable is not
needed. When free_mqd is called, the queue is no longer on the
qpd->queues_list, so destroy_queue cannot race with it. If we ensure
that queues are always removed from the list before calling free_mqd,
and that list-removal happens under the dqm_lock, then there should be
no risk of a race condition that causes a double-free.

Regards,
  Felix


>  	}
>  
>  	/* Unregister process */

* Re: [PATCH] drm/amdkfd: Fix circular lock in nocpsch path
  2021-06-15 18:22 ` Felix Kuehling
@ 2021-06-16  4:01   ` Pan, Xinhui
  2021-06-16  4:36     ` Felix Kuehling
From: Pan, Xinhui @ 2021-06-16  4:01 UTC
  To: Kuehling, Felix; +Cc: Lin, Amber, Pan, Xinhui, amd-gfx



> On 2021-06-16 at 02:22, Kuehling, Felix <Felix.Kuehling@amd.com> wrote:
> 
> [+Xinhui]
> 
> 
> On 2021-06-15 at 1:50 p.m., Amber Lin wrote:
>> Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
>> circular lock dependency. destroy_queue_nocpsch_locked is called while
>> holding the DQM lock, which is also taken in MMU notifiers, potentially
>> in FS reclaim context. free_mqd takes another lock, the BO reservation
>> lock, and can trigger FS reclaim; doing so while holding the DQM lock
>> creates a circular lock dependency. Therefore move free_mqd out of
>> destroy_queue_nocpsch_locked and call it after unlocking the DQM lock.
>> 
>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
> 
> Let's submit this patch as is. I'm making some comments inline for
> things that Xinhui can address in his race condition patch.
> 
> 
>> ---
>> .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +++++++++++++-----
>> 1 file changed, 13 insertions(+), 5 deletions(-)
>> 
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> index 72bea5278add..c069fa259b30 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>> @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>> 	if (retval == -ETIME)
>> 		qpd->reset_wavefronts = true;
>> 
>> -
>> -	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> -
>> 	list_del(&q->list);
>> 	if (list_empty(&qpd->queues_list)) {
>> 		if (qpd->reset_wavefronts) {
>> @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>> 	int retval;
>> 	uint64_t sdma_val = 0;
>> 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>> +	struct mqd_manager *mqd_mgr =
>> +		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
>> 
>> 	/* Get the SDMA queue stats */
>> 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>> @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>> 		pdd->sdma_past_activity_counter += sdma_val;
>> 	dqm_unlock(dqm);
>> 
>> +	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> +
>> 	return retval;
>> }
>> 
>> @@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
>> static int process_termination_nocpsch(struct device_queue_manager *dqm,
>> 		struct qcm_process_device *qpd)
>> {
>> -	struct queue *q, *next;
>> +	struct queue *q;
>> 	struct device_process_node *cur, *next_dpn;
>> 	int retval = 0;
>> 	bool found = false;
>> @@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
>> 	dqm_lock(dqm);
>> 
>> 	/* Clear all user mode queues */
>> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
>> +	while (!list_empty(&qpd->queues_list)) {
>> +		struct mqd_manager *mqd_mgr;
>> 		int ret;
>> 
>> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
>> +		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>> +				q->properties.type)];
>> 		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
>> 		if (ret)
>> 			retval = ret;
>> +		dqm_unlock(dqm);
>> +		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>> +		dqm_lock(dqm);
> 
> This is the correct way to clean up the list when dropping the dqm-lock
> in the middle. Xinhui, you can use the same method in
> process_termination_cpsch.
> 

yes, that is the right way to walk through the list. thanks.


> I believe the swapping of the q->mqd with a temporary variable is not
> needed. When free_mqd is called, the queue is no longer on the
> qpd->queues_list, so destroy_queue cannot race with it. If we ensure
> that queues are always removed from the list before calling free_mqd,
> and that list-removal happens under the dqm_lock, then there should be
> no risk of a race condition that causes a double-free.
> 

No, the double free exists because pqm_destroy_queue fetches the queue
from the qid via get_queue_by_qid(). The race looks like this:

pqm_destroy_queue
	get_queue_by_qid				process_termination_cpsch
	destroy_queue_cpsch
								lock
								list_for_each_entry_safe
									list_del(q)
								unlock
								free_mqd
	lock
	list_del(q)
	unlock
	free_mqd

> Regards,
>   Felix
> 
> 
>> 	}
>> 
>> 	/* Unregister process */


* Re: [PATCH] drm/amdkfd: Fix circular lock in nocpsch path
  2021-06-16  4:01   ` Pan, Xinhui
@ 2021-06-16  4:36     ` Felix Kuehling
  2021-06-16  5:44       ` Pan, Xinhui
From: Felix Kuehling @ 2021-06-16  4:36 UTC
  To: Pan, Xinhui; +Cc: Lin, Amber, amd-gfx

On 2021-06-16 at 12:01 a.m., Pan, Xinhui wrote:
>> On 2021-06-16 at 02:22, Kuehling, Felix <Felix.Kuehling@amd.com> wrote:
>>
>> [+Xinhui]
>>
>>
>> On 2021-06-15 at 1:50 p.m., Amber Lin wrote:
>>> Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
>>> circular lock dependency. destroy_queue_nocpsch_locked is called while
>>> holding the DQM lock, which is also taken in MMU notifiers, potentially
>>> in FS reclaim context. free_mqd takes another lock, the BO reservation
>>> lock, and can trigger FS reclaim; doing so while holding the DQM lock
>>> creates a circular lock dependency. Therefore move free_mqd out of
>>> destroy_queue_nocpsch_locked and call it after unlocking the DQM lock.
>>>
>>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>> Let's submit this patch as is. I'm making some comments inline for
>> things that Xinhui can address in his race condition patch.
>>
>>
>>> ---
>>> .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +++++++++++++-----
>>> 1 file changed, 13 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> index 72bea5278add..c069fa259b30 100644
>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>> @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>> 	if (retval == -ETIME)
>>> 		qpd->reset_wavefronts = true;
>>>
>>> -
>>> -	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>> -
>>> 	list_del(&q->list);
>>> 	if (list_empty(&qpd->queues_list)) {
>>> 		if (qpd->reset_wavefronts) {
>>> @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>>> 	int retval;
>>> 	uint64_t sdma_val = 0;
>>> 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>> +	struct mqd_manager *mqd_mgr =
>>> +		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
>>>
>>> 	/* Get the SDMA queue stats */
>>> 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>> @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>>> 		pdd->sdma_past_activity_counter += sdma_val;
>>> 	dqm_unlock(dqm);
>>>
>>> +	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>> +
>>> 	return retval;
>>> }
>>>
>>> @@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
>>> static int process_termination_nocpsch(struct device_queue_manager *dqm,
>>> 		struct qcm_process_device *qpd)
>>> {
>>> -	struct queue *q, *next;
>>> +	struct queue *q;
>>> 	struct device_process_node *cur, *next_dpn;
>>> 	int retval = 0;
>>> 	bool found = false;
>>> @@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
>>> 	dqm_lock(dqm);
>>>
>>> 	/* Clear all user mode queues */
>>> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
>>> +	while (!list_empty(&qpd->queues_list)) {
>>> +		struct mqd_manager *mqd_mgr;
>>> 		int ret;
>>>
>>> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
>>> +		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>> +				q->properties.type)];
>>> 		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
>>> 		if (ret)
>>> 			retval = ret;
>>> +		dqm_unlock(dqm);
>>> +		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>> +		dqm_lock(dqm);
>> This is the correct way to clean up the list when dropping the dqm-lock
>> in the middle. Xinhui, you can use the same method in
>> process_termination_cpsch.
>>
> yes, that is the right way to walk through the list. thanks.
>
>
>> I believe the swapping of the q->mqd with a temporary variable is not
>> needed. When free_mqd is called, the queue is no longer on the
>> qpd->queues_list, so destroy_queue cannot race with it. If we ensure
>> that queues are always removed from the list before calling free_mqd,
>> and that list-removal happens under the dqm_lock, then there should be
>> no risk of a race condition that causes a double-free.
>>
> No, the double free exists because pqm_destroy_queue fetches the queue
> from the qid via get_queue_by_qid(). The race looks like this:
> pqm_destroy_queue
> 	get_queue_by_qid				process_termination_cpsch
> 	destroy_queue_cpsch
> 								lock
> 								list_for_each_entry_safe
> 									list_del(q)
> 								unlock
> 								free_mqd
> 	lock
> 	list_del(q)
> 	unlock
> 	free_mqd

I think if both those threads try to free the same queue, they both need
to hold the same process->mutex. For pqm_destroy_queue that happens in
kfd_ioctl_destroy_queue. For process_termination_cpsch that happens in
kfd_process_notifier_release before it calls
kfd_process_dequeue_from_all_devices.
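
Schematically, both paths take it around the queue destruction. A
simplified sketch of those call sites (not verbatim kernel code):

    /* ioctl path, kfd_ioctl_destroy_queue() */
    mutex_lock(&p->mutex);
    retval = pqm_destroy_queue(&p->pqm, args->queue_id);
    mutex_unlock(&p->mutex);

    /* teardown path, kfd_process_notifier_release() */
    mutex_lock(&p->mutex);
    kfd_process_dequeue_from_all_devices(p);
            /* eventually reaches process_termination_cpsch */
    mutex_unlock(&p->mutex);

So the two list_del/free_mqd sequences in your diagram cannot
interleave; whichever path runs second no longer finds the queue.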

Regards,
  Felix


> 	
>
>
>  
>> Regards,
>>   Felix
>>
>>
>>> 	}
>>>
>>> 	/* Unregister process */

* Re: [PATCH] drm/amdkfd: Fix circular lock in nocpsch path
  2021-06-16  4:36     ` Felix Kuehling
@ 2021-06-16  5:44       ` Pan, Xinhui
From: Pan, Xinhui @ 2021-06-16  5:44 UTC
  To: Kuehling, Felix; +Cc: Lin, Amber, Pan, Xinhui, amd-gfx



> On 2021-06-16 at 12:36, Kuehling, Felix <Felix.Kuehling@amd.com> wrote:
> 
> On 2021-06-16 at 12:01 a.m., Pan, Xinhui wrote:
>>> On 2021-06-16 at 02:22, Kuehling, Felix <Felix.Kuehling@amd.com> wrote:
>>> 
>>> [+Xinhui]
>>> 
>>> 
>>> On 2021-06-15 at 1:50 p.m., Amber Lin wrote:
>>>> Calling free_mqd inside destroy_queue_nocpsch_locked can cause a
>>>> circular lock dependency. destroy_queue_nocpsch_locked is called while
>>>> holding the DQM lock, which is also taken in MMU notifiers, potentially
>>>> in FS reclaim context. free_mqd takes another lock, the BO reservation
>>>> lock, and can trigger FS reclaim; doing so while holding the DQM lock
>>>> creates a circular lock dependency. Therefore move free_mqd out of
>>>> destroy_queue_nocpsch_locked and call it after unlocking the DQM lock.
>>>> 
>>>> Signed-off-by: Amber Lin <Amber.Lin@amd.com>
>>>> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
>>> Let's submit this patch as is. I'm making some comments inline for
>>> things that Xinhui can address in his race condition patch.
>>> 
>>> 
>>>> ---
>>>> .../drm/amd/amdkfd/kfd_device_queue_manager.c  | 18 +++++++++++++-----
>>>> 1 file changed, 13 insertions(+), 5 deletions(-)
>>>> 
>>>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> index 72bea5278add..c069fa259b30 100644
>>>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
>>>> @@ -486,9 +486,6 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
>>>> 	if (retval == -ETIME)
>>>> 		qpd->reset_wavefronts = true;
>>>> 
>>>> -
>>>> -	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>>> -
>>>> 	list_del(&q->list);
>>>> 	if (list_empty(&qpd->queues_list)) {
>>>> 		if (qpd->reset_wavefronts) {
>>>> @@ -523,6 +520,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>>>> 	int retval;
>>>> 	uint64_t sdma_val = 0;
>>>> 	struct kfd_process_device *pdd = qpd_to_pdd(qpd);
>>>> +	struct mqd_manager *mqd_mgr =
>>>> +		dqm->mqd_mgrs[get_mqd_type_from_queue_type(q->properties.type)];
>>>> 
>>>> 	/* Get the SDMA queue stats */
>>>> 	if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) ||
>>>> @@ -540,6 +539,8 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
>>>> 		pdd->sdma_past_activity_counter += sdma_val;
>>>> 	dqm_unlock(dqm);
>>>> 
>>>> +	mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>>> +
>>>> 	return retval;
>>>> }
>>>> 
>>>> @@ -1629,7 +1630,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
>>>> static int process_termination_nocpsch(struct device_queue_manager *dqm,
>>>> 		struct qcm_process_device *qpd)
>>>> {
>>>> -	struct queue *q, *next;
>>>> +	struct queue *q;
>>>> 	struct device_process_node *cur, *next_dpn;
>>>> 	int retval = 0;
>>>> 	bool found = false;
>>>> @@ -1637,12 +1638,19 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
>>>> 	dqm_lock(dqm);
>>>> 
>>>> 	/* Clear all user mode queues */
>>>> -	list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
>>>> +	while (!list_empty(&qpd->queues_list)) {
>>>> +		struct mqd_manager *mqd_mgr;
>>>> 		int ret;
>>>> 
>>>> +		q = list_first_entry(&qpd->queues_list, struct queue, list);
>>>> +		mqd_mgr = dqm->mqd_mgrs[get_mqd_type_from_queue_type(
>>>> +				q->properties.type)];
>>>> 		ret = destroy_queue_nocpsch_locked(dqm, qpd, q);
>>>> 		if (ret)
>>>> 			retval = ret;
>>>> +		dqm_unlock(dqm);
>>>> +		mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
>>>> +		dqm_lock(dqm);
>>> This is the correct way to clean up the list when dropping the dqm-lock
>>> in the middle. Xinhui, you can use the same method in
>>> process_termination_cpsch.
>>> 
>> yes, that is the right way to walk through the list. thanks.
>> 
>> 
>>> I believe the swapping of the q->mqd with a temporary variable is not
>>> needed. When free_mqd is called, the queue is no longer on the
>>> qpd->queues_list, so destroy_queue cannot race with it. If we ensure
>>> that queues are always removed from the list before calling free_mqd,
>>> and that list-removal happens under the dqm_lock, then there should be
>>> no risk of a race condition that causes a double-free.
>>> 
>> No, the double free exists because pqm_destroy_queue fetches the queue
>> from the qid via get_queue_by_qid(). The race looks like this:
>> pqm_destroy_queue
>> 	get_queue_by_qid				process_termination_cpsch
>> 	destroy_queue_cpsch
>> 								lock
>> 								list_for_each_entry_safe
>> 									list_del(q)
>> 								unlock
>> 								free_mqd
>> 	lock
>> 	list_del(q)
>> 	unlock
>> 	free_mqd
> 
> I think if both those threads try to free the same queue, they both need
> to hold the same process->mutex. For pqm_destroy_queue that happens in
> kfd_ioctl_destroy_queue. For process_termination_cpsch that happens in
> kfd_process_notifier_release before it calls
> kfd_process_dequeue_from_all_devices.
Oh, yes, you are right.
So the double free I am seeing has a different root cause. :(

> 
> Regards,
>   Felix
> 
> 
>> 	
>> 
>> 
>> 
>>> Regards,
>>>  Felix
>>> 
>>> 
>>>> 	}
>>>> 
>>>> 	/* Unregister process */

